 logger = logging.getLogger(__name__)


-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
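For illustration, a rough sketch of how this new helper behaves (these calls are not
part of the diff, and assume VcsSupport.schemes holds bare scheme names such as
'git' and 'hg'):

    _match_vcs_scheme('git+https://github.com/pypa/pip.git')  # -> 'git'
    _match_vcs_scheme('https://pypi.org/simple/pip/')         # -> None
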
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
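Similarly, a hypothetical sketch of _is_url_like_archive, assuming ARCHIVE_EXTENSIONS
(imported elsewhere in this module) contains common suffixes such as '.tar.gz' and '.zip':

    _is_url_like_archive('https://example.com/packages/pkg-1.0.tar.gz')  # -> True
    _is_url_like_archive('https://example.com/simple/pkg/')              # -> False
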
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
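Because _ensure_html_header only reads response.headers and response.request.method,
its failure path can be sketched with hypothetical stand-in objects (illustration only,
not part of the diff):

    class _FakeRequest(object):
        method = 'HEAD'

    class _FakeResponse(object):
        headers = {'Content-Type': 'application/json'}
        request = _FakeRequest()

    try:
        _ensure_html_header(_FakeResponse())
    except _NotHTML as exc:
        # exc.content_type == 'application/json', exc.request_desc == 'HEAD'
        pass
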
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()

     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()

-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly return cached data for
+            # /simple/, because authors generally expect that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp


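A rough usage sketch of _get_html_response (illustrative only; pip passes its own
requests-based session here, but any object with compatible get() and head() methods
would do for the sketch):

    import requests

    session = requests.Session()  # stand-in for pip's session object
    try:
        resp = _get_html_response('https://pypi.org/simple/pip/', session=session)
    except _NotHTTP:
        # the URL looked like an archive and could not be checked with a HEAD request
        pass
    except _NotHTML as exc:
        print('Got %s instead of HTML via %s' % (exc.content_type, exc.request_desc))
    else:
        print(resp.url, len(resp.content))
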
 def _handle_get_page_fail(link, reason, url, meth=None):
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
             "_get_html_page() missing 1 required keyword argument: 'session'"
         )

-    url = link.url
-    url = url.split('#', 1)[0]
+    url = link.url.split('#', 1)[0]

     # Check for VCS schemes that do not support lookup as web pages.
-    from pip._internal.vcs import VcsSupport
-    for scheme in VcsSupport.schemes:
-        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
-            logger.debug('Cannot look at %s URL %s', scheme, link)
-            return None
+    vcs_scheme = _match_vcs_scheme(url)
+    if vcs_scheme:
+        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
+        return None

-    try:
-        filename = link.filename
-        for bad_ext in ARCHIVE_EXTENSIONS:
-            if filename.endswith(bad_ext):
-                content_type = _get_content_type(url, session=session)
-                if content_type.lower().startswith('text/html'):
-                    break
-                else:
-                    logger.debug(
-                        'Skipping page %s because of Content-Type: %s',
-                        link,
-                        content_type,
-                    )
-                    return
+    # Tack index.html onto file:// URLs that point to directories
+    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
+    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
+        # add trailing slash if not present so urljoin doesn't trim
+        # final segment
+        if not url.endswith('/'):
+            url += '/'
+        url = urllib_parse.urljoin(url, 'index.html')
+        logger.debug(' file: URL is directory, getting %s', url)

-        logger.debug('Getting page %s', url)
-
-        # Tack index.html onto file:// URLs that point to directories
-        (scheme, netloc, path, params, query, fragment) = \
-            urllib_parse.urlparse(url)
-        if (scheme == 'file' and
-                os.path.isdir(urllib_request.url2pathname(path))):
-            # add trailing slash if not present so urljoin doesn't trim
-            # final segment
-            if not url.endswith('/'):
-                url += '/'
-            url = urllib_parse.urljoin(url, 'index.html')
-            logger.debug(' file: URL is directory, getting %s', url)
-
-        resp = session.get(
-            url,
-            headers={
-                "Accept": "text/html",
-                # We don't want to blindly return cached data for
-                # /simple/, because authors generally expect that
-                # twine upload && pip install will function, but if
-                # they've done a pip install in the last ~10 minutes
-                # it won't. Thus by setting this to zero we will not
-                # blindly use any cached data, however the benefit of
-                # using max-age=0 instead of no-cache, is that we will
-                # still support conditional requests, so we will still
-                # minimize traffic sent in cases where the page hasn't
-                # changed at all, we will just always incur the round
-                # trip for the conditional GET now instead of only
-                # once per 10 minutes.
-                # For more information, please see pypa/pip#5670.
-                "Cache-Control": "max-age=0",
-            },
+    try:
+        resp = _get_html_response(url, session=session)
+    except _NotHTTP as exc:
+        logger.debug(
+            'Skipping page %s because it looks like an archive, and cannot '
+            'be checked by HEAD.', link,
+        )
+    except _NotHTML as exc:
+        logger.debug(
+            'Skipping page %s because the %s request got Content-Type: %s',
+            link, exc.request_desc, exc.content_type,
         )
-        resp.raise_for_status()
-
-        # The check for archives above only works if the url ends with
-        # something that looks like an archive. However that is not a
-        # requirement of an url. Unless we issue a HEAD request on every
-        # url we cannot know ahead of time for sure if something is HTML
-        # or not. However we can check after we've downloaded it.
-        content_type = resp.headers.get('Content-Type', 'unknown')
-        if not content_type.lower().startswith("text/html"):
-            logger.debug(
-                'Skipping page %s because of Content-Type: %s',
-                link,
-                content_type,
-            )
-            return
-
-        inst = HTMLPage(resp.content, resp.url, resp.headers)
     except requests.HTTPError as exc:
         _handle_get_page_fail(link, exc, url)
     except RetryError as exc:
@@ -174,7 +223,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)


 class PackageFinder(object):
@@ -679,7 +728,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)

-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue

@@ -796,9 +845,6 @@ def _link_package_versions(self, link, search):

         return InstallationCandidate(search.supplied, version, link)

-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-

 def egg_info_matches(
         egg_info, search_name, link,