release 2013.05.04

Work around a Python bug on Windows with UTF-8 configuration (#820)
Fix comedycentral: newest
2026-04-23 20:36:55 +00:00 · 2013-05-04 07:12:33 +02:00 · 2013-05-04 07:09:50 +02:00 · 2013-05-04 02:53:26 +02:00 · 2013-05-03 20:08:16 +02:00 · 2013-05-03 20:07:35 +02:00
4 changed files with 179 additions and 213 deletions
--- a/test/tests.json
+++ b/test/tests.json
@@ -112,7 +112,7 @@
  {
    "name": "Escapist",
    "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate",
-    "file": "6618-Breaking-Down-Baldurs-Gate.flv",
+    "file": "6618-Breaking-Down-Baldurs-Gate.mp4",
    "md5": "c6793dbda81388f4264c1ba18684a74d",
    "skip": "Fails with timeout on Travis"
  },
@@ -344,7 +344,7 @@
    "file": "17258355236.mp4",
    "md5": "7c6a514d691b034ccf8567999e9e88a3",
    "info_dict": {
-        "title": "A sample video from LeeAnn. (If you need an idea..."
+        "title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..."
    }
  },
  {
@@ -394,6 +394,23 @@
        }
      }
    ]
+  },
+  {
+    "name":"Bandcamp",
+    "url":"http://youtube-dl.bandcamp.com/track/youtube-dl-test-song",
+    "file":"1812978515.mp3",
+    "md5":"cdeb30cdae1921719a3cbcab696ef53c",
+    "info_dict": {
+      "title":"youtube-dl test song \"'/\\ä↭"
+    }
+  },
+  {
+    "name": "RedTube",
+    "url": "http://www.redtube.com/66418",
+    "file": "66418.mp4",
+    "md5": "7b8c22b5e7098a3e1c09709df1126d2d",
+    "info_dict":{
+      "title":"Sucked on a toilet"
+    }
  }
-
 ]
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -124,8 +124,8 @@ class InfoExtractor(object):
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
-        """ Returns the data of the page as a string """
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
+        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -142,7 +142,12 @@ class InfoExtractor(object):
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
-        return webpage_bytes.decode(encoding, 'replace')
+        content = webpage_bytes.decode(encoding, 'replace')
+        return (content, urlh)
+
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+        """ Returns the data of the page as a string """
+        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
@@ -779,7 +784,6 @@ class MetacafeIE(InfoExtractor):
            'ext':      video_extension.decode('utf-8'),
        }]

-
 class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

@@ -1069,13 +1073,7 @@ class VimeoIE(InfoExtractor):

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
-        try:
-            self.report_download_webpage(video_id)
-            webpage_bytes = compat_urllib_request.urlopen(request).read()
-            webpage = webpage_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
-            return
+        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
@@ -1682,10 +1680,6 @@ class YoutubePlaylistIE(InfoExtractor):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

-    def report_download_page(self, playlist_id, pagenum):
-        """Report attempt to download playlist page with given number."""
-        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
-
    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1699,14 +1693,8 @@ class YoutubePlaylistIE(InfoExtractor):
        videos = []

        while True:
-            self.report_download_page(playlist_id, page_num)
-
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
-            try:
-                page = compat_urllib_request.urlopen(url).read().decode('utf8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-                return
+            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
@@ -1745,10 +1733,6 @@ class YoutubeChannelIE(InfoExtractor):
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

-    def report_download_page(self, channel_id, pagenum):
-        """Report attempt to download channel page with given number."""
-        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
-
    def extract_videos_from_page(self, page):
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
@@ -1768,14 +1752,9 @@ class YoutubeChannelIE(InfoExtractor):
        video_ids = []
        pagenum = 1

-        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
-        request = compat_urllib_request.Request(url)
-        try:
-            page = compat_urllib_request.urlopen(request).read().decode('utf8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-            return
+        page = self._download_webpage(url, channel_id,
+                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
@@ -1786,14 +1765,9 @@ class YoutubeChannelIE(InfoExtractor):
            while True:
                pagenum = pagenum + 1

-                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
-                request = compat_urllib_request.Request(url)
-                try:
-                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
-                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-                    return
+                page = self._download_webpage(url, channel_id,
+                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

@@ -1820,11 +1794,6 @@ class YoutubeUserIE(InfoExtractor):
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

-    def report_download_page(self, username, start_index):
-        """Report attempt to download user page."""
-        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
-                (username, start_index, start_index + self._GDATA_PAGE_SIZE))
-
    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
@@ -1844,15 +1813,10 @@ class YoutubeUserIE(InfoExtractor):

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
-            self.report_download_page(username, start_index)

-            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
-
-            try:
-                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-                return
+            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
+            page = self._download_webpage(gdata_url, username,
+                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []
@@ -1886,11 +1850,6 @@ class BlipTVUserIE(InfoExtractor):
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

-    def report_download_page(self, username, pagenum):
-        """Report attempt to download user page."""
-        self.to_screen(u'user %s: Downloading video ids from page %d' %
-                (username, pagenum))
-
    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
@@ -1902,15 +1861,9 @@ class BlipTVUserIE(InfoExtractor):

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

-        request = compat_urllib_request.Request(url)
-
-        try:
-            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            mobj = re.search(r'data-users-id="([^"]+)"', page)
-            page_base = page_base % mobj.group(1)
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-            return
+        page = self._download_webpage(url, username, u'Downloading user page')
+        mobj = re.search(r'data-users-id="([^"]+)"', page)
+        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
@@ -1922,14 +1875,9 @@ class BlipTVUserIE(InfoExtractor):
        pagenum = 1

        while True:
-            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
-            request = compat_urllib_request.Request( url )
-            try:
-                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
-                return
+            page = self._download_webpage(url, username,
+                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []
@@ -2288,12 +2236,6 @@ class ComedyCentralIE(InfoExtractor):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

-    def report_config_download(self, episode_id, media_id):
-        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
-
-    def report_index_download(self, episode_id):
-        self.to_screen(u'%s: Downloading show index' % episode_id)
-
    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
@@ -2327,24 +2269,15 @@ class ComedyCentralIE(InfoExtractor):
            else:
                epTitle = mobj.group('episode')

-        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
-        try:
-            htmlHandle = compat_urllib_request.urlopen(req)
-            html = htmlHandle.read()
-            webpage = html.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-            return
+        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
-                self._downloader.report_error(u'Invalid redirected URL: ' + url)
-                return
+                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
-                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
-                return
+                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
@@ -2356,19 +2289,15 @@ class ComedyCentralIE(InfoExtractor):

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
-                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
-                return
+                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
-        self.report_index_download(epTitle)
-        try:
-            indexXml = compat_urllib_request.urlopen(indexUrl).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
-            return
+        indexXml = self._download_webpage(indexUrl, epTitle,
+                                          u'Downloading show index',
+                                          u'unable to download episode index')

        results = []

@@ -2383,13 +2312,8 @@ class ComedyCentralIE(InfoExtractor):

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
-            configReq = compat_urllib_request.Request(configUrl)
-            self.report_config_download(epTitle, shortMediaId)
-            try:
-                configXml = compat_urllib_request.urlopen(configReq).read()
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-                return
+            configXml = self._download_webpage(configUrl, epTitle,
+                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
@@ -2446,9 +2370,6 @@ class EscapistIE(InfoExtractor):
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

-    def report_config_download(self, showName):
-        self.to_screen(u'%s: Downloading configuration' % showName)
-
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
@@ -2458,14 +2379,7 @@ class EscapistIE(InfoExtractor):
        videoId = mobj.group('episode')

        self.report_extraction(showName)
-        try:
-            webPage = compat_urllib_request.urlopen(url)
-            webPageBytes = webPage.read()
-            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
-            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
-            return
+        webPage = self._download_webpage(url, showName)

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
@@ -2476,14 +2390,9 @@ class EscapistIE(InfoExtractor):
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

-        self.report_config_download(showName)
-        try:
-            configJSON = compat_urllib_request.urlopen(configUrl)
-            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
-            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
-            return
+        configJSON = self._download_webpage(configUrl, showName,
+                                            u'Downloading configuration',
+                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')
@@ -2661,31 +2570,22 @@ class SoundcloudIE(InfoExtractor):
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
+        full_title = '%s/%s' % (uploader, slug_title)

-        self.report_resolve('%s/%s' % (uploader, slug_title))
+        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-        request = compat_urllib_request.Request(resolv_url)
-        try:
-            info_json_bytes = compat_urllib_request.urlopen(request).read()
-            info_json = info_json_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
-            return
+        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
-        self.report_extraction('%s/%s' % (uploader, slug_title))
+        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-        request = compat_urllib_request.Request(streams_url)
-        try:
-            stream_json_bytes = compat_urllib_request.urlopen(request).read()
-            stream_json = stream_json_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
-            return
+        stream_json = self._download_webpage(streams_url, full_title,
+                                             u'Downloading stream definitions',
+                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
@@ -2728,18 +2628,13 @@ class SoundcloudSetIE(InfoExtractor):
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
+        full_title = '%s/sets/%s' % (uploader, slug_title)

-        self.report_resolve('%s/sets/%s' % (uploader, slug_title))
+        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-        request = compat_urllib_request.Request(resolv_url)
-        try:
-            info_json_bytes = compat_urllib_request.urlopen(request).read()
-            info_json = info_json_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
-            return
+        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
@@ -2748,19 +2643,14 @@ class SoundcloudSetIE(InfoExtractor):
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

+        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']
-            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-            request = compat_urllib_request.Request(streams_url)
-            try:
-                stream_json_bytes = compat_urllib_request.urlopen(request).read()
-                stream_json = stream_json_bytes.decode('utf-8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
-                return
+            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

+            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

@@ -3155,18 +3045,11 @@ class YoukuIE(InfoExtractor):

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

-        request = compat_urllib_request.Request(info_url, None, std_headers)
-        try:
-            self.report_download_webpage(video_id)
-            jsondata = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
-            return
+        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
-            jsonstr = jsondata.decode('utf-8')
-            config = json.loads(jsonstr)
+            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']
@@ -3234,15 +3117,8 @@ class XNXXIE(InfoExtractor):
            return
        video_id = mobj.group(1)

-        self.report_download_webpage(video_id)
-
        # Get webpage content
-        try:
-            webpage_bytes = compat_urllib_request.urlopen(url).read()
-            webpage = webpage_bytes.decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download video webpage: %s' % err)
-            return
+        webpage = self._download_webpage(url, video_id)

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
@@ -3314,12 +3190,7 @@ class GooglePlusIE(InfoExtractor):

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
-        request = compat_urllib_request.Request(post_url)
-        try:
-            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
-            return
+        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
@@ -3356,12 +3227,7 @@ class GooglePlusIE(InfoExtractor):
            self._downloader.report_error(u'unable to extract video page URL')

        video_page = mobj.group(1)
-        request = compat_urllib_request.Request(video_page)
-        try:
-            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
-            return
+        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


@@ -3448,14 +3314,10 @@ class JustinTVIE(InfoExtractor):
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
-    def _parse_page(self, url):
-        try:
-            urlh = compat_urllib_request.urlopen(url)
-            webpage_bytes = urlh.read()
-            webpage = webpage_bytes.decode('utf-8', 'ignore')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
-            return
+    def _parse_page(self, url, video_id):
+        webpage = self._download_webpage(url, video_id,
+                                         u'Downloading video info JSON',
+                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
@@ -3507,7 +3369,7 @@ class JustinTVIE(InfoExtractor):
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
-            page_count, page_info = self._parse_page(page_url)
+            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
@@ -3554,7 +3416,7 @@ class FunnyOrDieIE(InfoExtractor):
        return [info]

 class SteamIE(InfoExtractor):
-    _VALID_URL = r"""http://store.steampowered.com/
+    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
@@ -3625,16 +3487,15 @@ class WorldStarHipHopIE(InfoExtractor):
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
-        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
-
-        webpage_src = compat_urllib_request.urlopen(url).read()
-        webpage_src = webpage_src.decode('utf-8')
-
-        mobj = re.search(_src_url, webpage_src)
+        _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

+        webpage_src = self._download_webpage(url, video_id) 
+
+        mobj = re.search(_src_url, webpage_src)
+
        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
@@ -3988,12 +3849,13 @@ class KeekIE(InfoExtractor):
        return [info]

 class TEDIE(InfoExtractor):
-    _VALID_URL=r'''http://www.ted.com/
+    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
+                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

@@ -4244,7 +4106,7 @@ class ARDIE(InfoExtractor):
        return [info]

 class TumblrIE(InfoExtractor):
-    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
+    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
@@ -4254,7 +4116,7 @@ class TumblrIE(InfoExtractor):
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

-        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
+        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
@@ -4267,8 +4129,8 @@ class TumblrIE(InfoExtractor):

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
-        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
-        title = unescapeHTML(re.search(re_title, webpage).group('title'))
+        re_title = r'<title>(?P<title>.*?)</title>'
+        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
@@ -4277,6 +4139,83 @@ class TumblrIE(InfoExtractor):
                 'ext': ext
                 }]

+class BandcampIE(InfoExtractor):
+    """Information Extractor for free tracks on bandcamp.com"""
+    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
+
+    def _real_extract(self, url):
+        """Return [info dict] for the track's mp3-320 download; report an error and return None if no free download page is linked."""
+        title = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, title)
+        # We get the link to the free download page
+        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
+        if m_download is None:
+            self._downloader.report_error('No free songs founded')
+            return
+        download_link = m_download.group(1)
+        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
+                         webpage, re.MULTILINE|re.DOTALL)
+        if m_id is None:
+            raise ExtractorError(u'Unable to extract track id')
+        track_id = m_id.group('id')
+        download_webpage = self._download_webpage(download_link, track_id,
+                                                  'Downloading free downloads page')
+        # We get the dictionary of the track from some javascript code
+        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
+        if m_items is None:
+            raise ExtractorError(u'Unable to extract track information')
+        info = json.loads(m_items.group(1))[0]
+        # We pick mp3-320 for now, until format selection can be easily implemented.
+        # If we try to use this url directly it says the link has expired
+        initial_url = info[u'downloads'][u'mp3-320'][u'url']
+        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
+        m_url = re.match(re_url, initial_url)
+        if m_url is None:
+            raise ExtractorError(u'Unable to parse download url')
+        # We build the url we will use to get the final track url
+        # This url is built in Bandcamp in the script download_bunde_*.js
+        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
+        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
+        # If we could correctly generate the .rand field the url would be
+        # in the "download_url" key
+        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
+        if m_final is None:
+            raise ExtractorError(u'Unable to extract final url')
+        return [{'id': track_id, 'title': info[u'title'], 'ext': 'mp3',
+                 'url': m_final.group(1), 'thumbnail': info[u'thumb_url'],
+                 'uploader': info[u'artist']}]
+
+class RedTubeIE(InfoExtractor):
+    """Information Extractor for redtube"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
+
+    def _real_extract(self, url):
+        """Return a one-element list holding the info dict of the video at url."""
+        url_match = re.match(self._VALID_URL, url)
+        if url_match is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = url_match.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        self.report_extraction(video_id)
+
+        # The media URL is the src attribute of the mp4 <source> tag
+        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
+        if source_match is None:
+            raise ExtractorError(u'Unable to extract media URL')
+
+        # The title is the text of the videoTitle <h1> heading
+        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
+        if title_match is None:
+            raise ExtractorError(u'Unable to extract title')
+
+        return [{
+            'id': video_id,
+            'url': source_match.group(1),
+            'ext': 'mp4',
+            'title': title_match.group(1),
+        }]
+

 def gen_extractors():
    """ Return a list of an instance of every supported extractor.
@@ -4332,6 +4271,8 @@ def gen_extractors():
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
+        BandcampIE(),
+        RedTubeIE(),
        GenericIE()
    ]

--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@@ -25,10 +25,12 @@ __authors__  = (
    'Jeff Crouse',
    'Osama Khalid',
    'Michael Walter',
+    'M. Yasoob Ullah Khalid',
    )

 __license__ = 'Public Domain'

+import codecs
 import getpass
 import optparse
 import os
@@ -334,6 +336,11 @@ def parseOpts(overrideArguments=None):
    return parser, opts, args

 def _real_main(argv=None):
+    # Compatibility fixes for Windows
+    if sys.platform == 'win32':
+        # https://github.com/rg3/youtube-dl/issues/820
+        codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
+
    parser, opts, args = parseOpts(argv)

    # Open appropriate CookieJar
@@ -388,7 +395,8 @@ def _real_main(argv=None):
        if 'http' in proxies and 'https' not in proxies:
            proxies['https'] = proxies['http']
    proxy_handler = compat_urllib_request.ProxyHandler(proxies)
-    opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
+    https_handler = compat_urllib_request.HTTPSHandler()
+    opener = compat_urllib_request.build_opener(https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
    compat_urllib_request.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@

-__version__ = '2013.05.01'
+__version__ = '2013.05.04'
Author	SHA1	Message	Date
Philipp Hagemeister	1f99511210	release 2013.05.04	2013-05-04 07:12:33 +02:00
Philipp Hagemeister	0d94f2474c	Work around a Python bug on Windows with UTF-8 configuration (#820 )	2013-05-04 07:09:50 +02:00
Philipp Hagemeister	480b6c1e8b	Fix comedycentral: newest	2013-05-04 02:53:26 +02:00
Philipp Hagemeister	95464f14d1	Credit @yasoob for IE	2013-05-03 20:08:16 +02:00
Philipp Hagemeister	c34407d16c	Simplify RedTube	2013-05-03 20:07:35 +02:00
M.Yasoob Ullah Khalid	5e34d2ebbf	Moved redtube info extractor to the end	2013-05-03 23:57:16 +06:00
M.Yasoob Ullah Khalid	815dd2ffa8	Redtube test now works I just did a little makeover by changing redtube tests. Now they are passed.	2013-05-03 23:51:27 +06:00
M.Yasoob Ullah Khalid	ecd5fb49c5	added redtube.com in InfoExtractors (2nd pull request with the required amindments) added redtube.com in InfoExtractors (2nd pull request with the required amindments). Now this script can also download redtube.com videos	2013-05-03 22:44:34 +06:00
M.Yasoob Ullah Khalid	b86174e7a3	added test for redtube.com I just added the test for redtube.com	2013-05-03 22:40:56 +06:00
Jaime Marquínez Ferrándiz	2e2038dc35	TEDIE: report the correct talk title when a link with the language code is given	2013-05-02 18:28:07 +02:00
Jaime Marquínez Ferrándiz	46bfb42258	InfoExtractors: use _download_webpage in more IEs IEs without tests are intact.	2013-05-02 18:18:27 +02:00
Jaime Marquínez Ferrándiz	feecf22511	InfoExtractors: fix some regular expressions where dots weren't escaped	2013-05-02 13:39:56 +02:00
Jaime Marquínez Ferrándiz	4c4f15eb78	Merge pull request #815 from JohnyMoSwag/master Update for new source links on worldstarhiphop.com	2013-05-02 13:23:32 +02:00
Jaime Marquínez Ferrándiz	104ccdb8b4	TumblrIE: fix title matching	2013-05-02 13:12:41 +02:00
Johny Mo Swag	6ccff79594	Small update for additon of new video source links	2013-05-01 20:30:14 -07:00
Jaime Marquínez Ferrándiz	aed523ecc1	Add BandcampIE (closes #568 )	2013-05-01 15:55:46 +02:00