xref: /trafficserver/proxy/hdrs/URL.cc (revision 4cfd5a73)
1 /** @file
2 
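  URL objects for the header heap: scheme token setup, creation and copying,
  marshaling, component setters and getters, and length/printing helpers.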
4 
5   @section license License
6 
7   Licensed to the Apache Software Foundation (ASF) under one
8   or more contributor license agreements.  See the NOTICE file
9   distributed with this work for additional information
10   regarding copyright ownership.  The ASF licenses this file
11   to you under the Apache License, Version 2.0 (the
12   "License"); you may not use this file except in compliance
13   with the License.  You may obtain a copy of the License at
14 
15       http://www.apache.org/licenses/LICENSE-2.0
16 
17   Unless required by applicable law or agreed to in writing, software
18   distributed under the License is distributed on an "AS IS" BASIS,
19   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20   See the License for the specific language governing permissions and
21   limitations under the License.
22  */
23 
24 #include <cassert>
25 #include <new>
26 #include "tscore/ink_platform.h"
27 #include "tscore/ink_memory.h"
28 #include "tscore/TsBuffer.h"
29 #include "URL.h"
30 #include "MIME.h"
31 #include "HTTP.h"
32 #include "tscore/Diags.h"
33 
// Well-known-string tokens for each supported URL scheme.
// They stay null until url_init() fills them in from hdrtoken_string_to_wks().
const char *URL_SCHEME_FILE     = nullptr;
const char *URL_SCHEME_FTP      = nullptr;
const char *URL_SCHEME_GOPHER   = nullptr;
const char *URL_SCHEME_HTTP     = nullptr;
const char *URL_SCHEME_HTTPS    = nullptr;
const char *URL_SCHEME_WSS      = nullptr;
const char *URL_SCHEME_WS       = nullptr;
const char *URL_SCHEME_MAILTO   = nullptr;
const char *URL_SCHEME_NEWS     = nullptr;
const char *URL_SCHEME_NNTP     = nullptr;
const char *URL_SCHEME_PROSPERO = nullptr;
const char *URL_SCHEME_TELNET   = nullptr;
const char *URL_SCHEME_TUNNEL   = nullptr;
const char *URL_SCHEME_WAIS     = nullptr;
const char *URL_SCHEME_PNM      = nullptr;
const char *URL_SCHEME_RTSP     = nullptr;
const char *URL_SCHEME_RTSPU    = nullptr;
const char *URL_SCHEME_MMS      = nullptr;
const char *URL_SCHEME_MMSU     = nullptr;
const char *URL_SCHEME_MMST     = nullptr;
54 
// Well-known-string table index of each scheme token.
// Zero until url_init() assigns the value returned by hdrtoken_wks_to_index().
int URL_WKSIDX_FILE     = 0;
int URL_WKSIDX_FTP      = 0;
int URL_WKSIDX_GOPHER   = 0;
int URL_WKSIDX_HTTP     = 0;
int URL_WKSIDX_HTTPS    = 0;
int URL_WKSIDX_WS       = 0;
int URL_WKSIDX_WSS      = 0;
int URL_WKSIDX_MAILTO   = 0;
int URL_WKSIDX_NEWS     = 0;
int URL_WKSIDX_NNTP     = 0;
int URL_WKSIDX_PROSPERO = 0;
int URL_WKSIDX_TELNET   = 0;
int URL_WKSIDX_TUNNEL   = 0;
int URL_WKSIDX_WAIS     = 0;
int URL_WKSIDX_PNM      = 0;
int URL_WKSIDX_RTSP     = 0;
int URL_WKSIDX_RTSPU    = 0;
int URL_WKSIDX_MMS      = 0;
int URL_WKSIDX_MMSU     = 0;
int URL_WKSIDX_MMST     = 0;
75 
// Length of each scheme token.
// Zero until url_init() assigns the value returned by hdrtoken_wks_to_length().
int URL_LEN_FILE     = 0;
int URL_LEN_FTP      = 0;
int URL_LEN_GOPHER   = 0;
int URL_LEN_HTTP     = 0;
int URL_LEN_HTTPS    = 0;
int URL_LEN_WS       = 0;
int URL_LEN_WSS      = 0;
int URL_LEN_MAILTO   = 0;
int URL_LEN_NEWS     = 0;
int URL_LEN_NNTP     = 0;
int URL_LEN_PROSPERO = 0;
int URL_LEN_TELNET   = 0;
int URL_LEN_TUNNEL   = 0;
int URL_LEN_WAIS     = 0;
int URL_LEN_PNM      = 0;
int URL_LEN_RTSP     = 0;
int URL_LEN_RTSPU    = 0;
int URL_LEN_MMS      = 0;
int URL_LEN_MMSU     = 0;
int URL_LEN_MMST     = 0;
96 
// Switch for how url_CryptoHash_get() is implemented: it records whether the
// url_CryptoHash_get_fast() variant should be used. Be aware that
// url_CryptoHash_get_fast() and url_CryptoHash_get_general() do NOT yield the same result.
static int url_hash_method{0};
100 
101 // test to see if a character is a valid character for a host in a URI according to
102 // RFC 3986 and RFC 1034
103 inline static int
is_host_char(char c)104 is_host_char(char c)
105 {
106   return (ParseRules::is_alnum(c) || (c == '-') || (c == '.') || (c == '[') || (c == ']') || (c == '_') || (c == ':') ||
107           (c == '~') || (c == '%'));
108 }
109 
110 // Checks if `addr` is a valid FQDN string
111 bool
validate_host_name(std::string_view addr)112 validate_host_name(std::string_view addr)
113 {
114   return std::all_of(addr.begin(), addr.end(), &is_host_char);
115 }
116 
117 /*-------------------------------------------------------------------------
118   -------------------------------------------------------------------------*/
119 
120 void
url_init()121 url_init()
122 {
123   static int init = 1;
124 
125   if (init) {
126     init = 0;
127 
128     hdrtoken_init();
129 
130     URL_SCHEME_FILE     = hdrtoken_string_to_wks("file");
131     URL_SCHEME_FTP      = hdrtoken_string_to_wks("ftp");
132     URL_SCHEME_GOPHER   = hdrtoken_string_to_wks("gopher");
133     URL_SCHEME_HTTP     = hdrtoken_string_to_wks("http");
134     URL_SCHEME_HTTPS    = hdrtoken_string_to_wks("https");
135     URL_SCHEME_WSS      = hdrtoken_string_to_wks("wss");
136     URL_SCHEME_WS       = hdrtoken_string_to_wks("ws");
137     URL_SCHEME_MAILTO   = hdrtoken_string_to_wks("mailto");
138     URL_SCHEME_NEWS     = hdrtoken_string_to_wks("news");
139     URL_SCHEME_NNTP     = hdrtoken_string_to_wks("nntp");
140     URL_SCHEME_PROSPERO = hdrtoken_string_to_wks("prospero");
141     URL_SCHEME_TELNET   = hdrtoken_string_to_wks("telnet");
142     URL_SCHEME_TUNNEL   = hdrtoken_string_to_wks("tunnel");
143     URL_SCHEME_WAIS     = hdrtoken_string_to_wks("wais");
144     URL_SCHEME_PNM      = hdrtoken_string_to_wks("pnm");
145     URL_SCHEME_RTSP     = hdrtoken_string_to_wks("rtsp");
146     URL_SCHEME_RTSPU    = hdrtoken_string_to_wks("rtspu");
147     URL_SCHEME_MMS      = hdrtoken_string_to_wks("mms");
148     URL_SCHEME_MMSU     = hdrtoken_string_to_wks("mmsu");
149     URL_SCHEME_MMST     = hdrtoken_string_to_wks("mmst");
150 
151     ink_assert(URL_SCHEME_FILE && URL_SCHEME_FTP && URL_SCHEME_GOPHER && URL_SCHEME_HTTP && URL_SCHEME_HTTPS && URL_SCHEME_WS &&
152                URL_SCHEME_WSS && URL_SCHEME_MAILTO && URL_SCHEME_NEWS && URL_SCHEME_NNTP && URL_SCHEME_PROSPERO &&
153                URL_SCHEME_TELNET && URL_SCHEME_TUNNEL && URL_SCHEME_WAIS && URL_SCHEME_PNM && URL_SCHEME_RTSP && URL_SCHEME_RTSPU &&
154                URL_SCHEME_MMS && URL_SCHEME_MMSU && URL_SCHEME_MMST);
155 
156     URL_WKSIDX_FILE     = hdrtoken_wks_to_index(URL_SCHEME_FILE);
157     URL_WKSIDX_FTP      = hdrtoken_wks_to_index(URL_SCHEME_FTP);
158     URL_WKSIDX_GOPHER   = hdrtoken_wks_to_index(URL_SCHEME_GOPHER);
159     URL_WKSIDX_HTTP     = hdrtoken_wks_to_index(URL_SCHEME_HTTP);
160     URL_WKSIDX_HTTPS    = hdrtoken_wks_to_index(URL_SCHEME_HTTPS);
161     URL_WKSIDX_WS       = hdrtoken_wks_to_index(URL_SCHEME_WS);
162     URL_WKSIDX_WSS      = hdrtoken_wks_to_index(URL_SCHEME_WSS);
163     URL_WKSIDX_MAILTO   = hdrtoken_wks_to_index(URL_SCHEME_MAILTO);
164     URL_WKSIDX_NEWS     = hdrtoken_wks_to_index(URL_SCHEME_NEWS);
165     URL_WKSIDX_NNTP     = hdrtoken_wks_to_index(URL_SCHEME_NNTP);
166     URL_WKSIDX_PROSPERO = hdrtoken_wks_to_index(URL_SCHEME_PROSPERO);
167     URL_WKSIDX_TELNET   = hdrtoken_wks_to_index(URL_SCHEME_TELNET);
168     URL_WKSIDX_TUNNEL   = hdrtoken_wks_to_index(URL_SCHEME_TUNNEL);
169     URL_WKSIDX_WAIS     = hdrtoken_wks_to_index(URL_SCHEME_WAIS);
170     URL_WKSIDX_PNM      = hdrtoken_wks_to_index(URL_SCHEME_PNM);
171     URL_WKSIDX_RTSP     = hdrtoken_wks_to_index(URL_SCHEME_RTSP);
172     URL_WKSIDX_RTSPU    = hdrtoken_wks_to_index(URL_SCHEME_RTSPU);
173     URL_WKSIDX_MMS      = hdrtoken_wks_to_index(URL_SCHEME_MMS);
174     URL_WKSIDX_MMSU     = hdrtoken_wks_to_index(URL_SCHEME_MMSU);
175     URL_WKSIDX_MMST     = hdrtoken_wks_to_index(URL_SCHEME_MMST);
176 
177     URL_LEN_FILE     = hdrtoken_wks_to_length(URL_SCHEME_FILE);
178     URL_LEN_FTP      = hdrtoken_wks_to_length(URL_SCHEME_FTP);
179     URL_LEN_GOPHER   = hdrtoken_wks_to_length(URL_SCHEME_GOPHER);
180     URL_LEN_HTTP     = hdrtoken_wks_to_length(URL_SCHEME_HTTP);
181     URL_LEN_HTTPS    = hdrtoken_wks_to_length(URL_SCHEME_HTTPS);
182     URL_LEN_WS       = hdrtoken_wks_to_length(URL_SCHEME_WS);
183     URL_LEN_WSS      = hdrtoken_wks_to_length(URL_SCHEME_WSS);
184     URL_LEN_MAILTO   = hdrtoken_wks_to_length(URL_SCHEME_MAILTO);
185     URL_LEN_NEWS     = hdrtoken_wks_to_length(URL_SCHEME_NEWS);
186     URL_LEN_NNTP     = hdrtoken_wks_to_length(URL_SCHEME_NNTP);
187     URL_LEN_PROSPERO = hdrtoken_wks_to_length(URL_SCHEME_PROSPERO);
188     URL_LEN_TELNET   = hdrtoken_wks_to_length(URL_SCHEME_TELNET);
189     URL_LEN_TUNNEL   = hdrtoken_wks_to_length(URL_SCHEME_TUNNEL);
190     URL_LEN_WAIS     = hdrtoken_wks_to_length(URL_SCHEME_WAIS);
191     URL_LEN_PNM      = hdrtoken_wks_to_length(URL_SCHEME_PNM);
192     URL_LEN_RTSP     = hdrtoken_wks_to_length(URL_SCHEME_RTSP);
193     URL_LEN_RTSPU    = hdrtoken_wks_to_length(URL_SCHEME_RTSPU);
194     URL_LEN_MMS      = hdrtoken_wks_to_length(URL_SCHEME_MMS);
195     URL_LEN_MMSU     = hdrtoken_wks_to_length(URL_SCHEME_MMSU);
196     URL_LEN_MMST     = hdrtoken_wks_to_length(URL_SCHEME_MMST);
197   }
198 }
199 
200 /*-------------------------------------------------------------------------
201   -------------------------------------------------------------------------*/
202 
203 /***********************************************************************
204  *                                                                     *
205  *             U R L    C R E A T I O N    A N D    C O P Y            *
206  *                                                                     *
207  ***********************************************************************/
208 
209 URLImpl *
url_create(HdrHeap * heap)210 url_create(HdrHeap *heap)
211 {
212   URLImpl *url;
213 
214   url = (URLImpl *)heap->allocate_obj(sizeof(URLImpl), HDR_HEAP_OBJ_URL);
215   obj_clear_data((HdrHeapObjImpl *)url);
216   url->m_url_type       = URL_TYPE_NONE;
217   url->m_scheme_wks_idx = -1;
218   url_clear_string_ref(url);
219   return url;
220 }
221 
222 /*-------------------------------------------------------------------------
223   -------------------------------------------------------------------------*/
224 
225 void
url_clear(URLImpl * url_impl)226 url_clear(URLImpl *url_impl)
227 {
228   obj_clear_data((HdrHeapObjImpl *)url_impl);
229   url_impl->m_url_type       = URL_TYPE_NONE;
230   url_impl->m_scheme_wks_idx = -1;
231 }
232 
233 /*-------------------------------------------------------------------------
234   -------------------------------------------------------------------------*/
235 
236 URLImpl *
url_copy(URLImpl * s_url,HdrHeap * s_heap,HdrHeap * d_heap,bool inherit_strs)237 url_copy(URLImpl *s_url, HdrHeap *s_heap, HdrHeap *d_heap, bool inherit_strs)
238 {
239   URLImpl *d_url = url_create(d_heap);
240   url_copy_onto(s_url, s_heap, d_url, d_heap, inherit_strs);
241   return d_url;
242 }
243 
244 /*-------------------------------------------------------------------------
245   -------------------------------------------------------------------------*/
246 
247 void
url_copy_onto(URLImpl * s_url,HdrHeap * s_heap,URLImpl * d_url,HdrHeap * d_heap,bool inherit_strs)248 url_copy_onto(URLImpl *s_url, HdrHeap *s_heap, URLImpl *d_url, HdrHeap *d_heap, bool inherit_strs)
249 {
250   if (s_url != d_url) {
251     obj_copy_data((HdrHeapObjImpl *)s_url, (HdrHeapObjImpl *)d_url);
252     if (inherit_strs && (s_heap != d_heap)) {
253       d_heap->inherit_string_heaps(s_heap);
254     }
255   }
256 }
257 
258 /*-------------------------------------------------------------------------
259   -------------------------------------------------------------------------*/
260 
261 void
url_nuke_proxy_stuff(URLImpl * d_url)262 url_nuke_proxy_stuff(URLImpl *d_url)
263 {
264   d_url->m_len_scheme   = 0;
265   d_url->m_len_user     = 0;
266   d_url->m_len_password = 0;
267   d_url->m_len_host     = 0;
268   d_url->m_len_port     = 0;
269 
270   d_url->m_ptr_scheme   = nullptr;
271   d_url->m_ptr_user     = nullptr;
272   d_url->m_ptr_password = nullptr;
273   d_url->m_ptr_host     = nullptr;
274   d_url->m_ptr_port     = nullptr;
275 
276   d_url->m_scheme_wks_idx = -1;
277   d_url->m_port           = 0;
278 }
279 
280 /*-------------------------------------------------------------------------
281   -------------------------------------------------------------------------*/
282 
283 /**
284   This routine is like url_copy_onto, but clears the
285   scheme/host/user/pass/port components, resulting in a server-style URL.
286 
287 */
288 void
url_copy_onto_as_server_url(URLImpl * s_url,HdrHeap * s_heap,URLImpl * d_url,HdrHeap * d_heap,bool inherit_strs)289 url_copy_onto_as_server_url(URLImpl *s_url, HdrHeap *s_heap, URLImpl *d_url, HdrHeap *d_heap, bool inherit_strs)
290 {
291   url_nuke_proxy_stuff(d_url);
292 
293   d_url->m_ptr_path     = s_url->m_ptr_path;
294   d_url->m_ptr_params   = s_url->m_ptr_params;
295   d_url->m_ptr_query    = s_url->m_ptr_query;
296   d_url->m_ptr_fragment = s_url->m_ptr_fragment;
297   url_clear_string_ref(d_url);
298 
299   d_url->m_len_path     = s_url->m_len_path;
300   d_url->m_len_params   = s_url->m_len_params;
301   d_url->m_len_query    = s_url->m_len_query;
302   d_url->m_len_fragment = s_url->m_len_fragment;
303 
304   d_url->m_url_type  = s_url->m_url_type;
305   d_url->m_type_code = s_url->m_type_code;
306 
307   if (inherit_strs && (s_heap != d_heap)) {
308     d_heap->inherit_string_heaps(s_heap);
309   }
310 }
311 
312 /*-------------------------------------------------------------------------
313   -------------------------------------------------------------------------*/
314 
315 /***********************************************************************
316  *                                                                     *
317  *                        M A R S H A L I N G                          *
318  *                                                                     *
319  ***********************************************************************/
320 int
marshal(MarshalXlate * str_xlate,int num_xlate)321 URLImpl::marshal(MarshalXlate *str_xlate, int num_xlate)
322 {
323   HDR_MARSHAL_STR(m_ptr_scheme, str_xlate, num_xlate);
324   HDR_MARSHAL_STR(m_ptr_user, str_xlate, num_xlate);
325   HDR_MARSHAL_STR(m_ptr_password, str_xlate, num_xlate);
326   HDR_MARSHAL_STR(m_ptr_host, str_xlate, num_xlate);
327   HDR_MARSHAL_STR(m_ptr_port, str_xlate, num_xlate);
328   HDR_MARSHAL_STR(m_ptr_path, str_xlate, num_xlate);
329   HDR_MARSHAL_STR(m_ptr_params, str_xlate, num_xlate);
330   HDR_MARSHAL_STR(m_ptr_query, str_xlate, num_xlate);
331   HDR_MARSHAL_STR(m_ptr_fragment, str_xlate, num_xlate);
332   //    HDR_MARSHAL_STR(m_ptr_printed_string, str_xlate, num_xlate);
333   return 0;
334 }
335 
336 void
unmarshal(intptr_t offset)337 URLImpl::unmarshal(intptr_t offset)
338 {
339   HDR_UNMARSHAL_STR(m_ptr_scheme, offset);
340   HDR_UNMARSHAL_STR(m_ptr_user, offset);
341   HDR_UNMARSHAL_STR(m_ptr_password, offset);
342   HDR_UNMARSHAL_STR(m_ptr_host, offset);
343   HDR_UNMARSHAL_STR(m_ptr_port, offset);
344   HDR_UNMARSHAL_STR(m_ptr_path, offset);
345   HDR_UNMARSHAL_STR(m_ptr_params, offset);
346   HDR_UNMARSHAL_STR(m_ptr_query, offset);
347   HDR_UNMARSHAL_STR(m_ptr_fragment, offset);
348   //    HDR_UNMARSHAL_STR(m_ptr_printed_string, offset);
349 }
350 
351 void
rehome_strings(HdrHeap * new_heap)352 URLImpl::rehome_strings(HdrHeap *new_heap)
353 {
354   m_ptr_scheme         = new_heap->localize({m_ptr_scheme, m_len_scheme}).data();
355   m_ptr_user           = new_heap->localize({m_ptr_user, m_len_user}).data();
356   m_ptr_password       = new_heap->localize({m_ptr_password, m_len_password}).data();
357   m_ptr_host           = new_heap->localize({m_ptr_host, m_len_host}).data();
358   m_ptr_port           = new_heap->localize({m_ptr_port, m_len_port}).data();
359   m_ptr_path           = new_heap->localize({m_ptr_path, m_len_path}).data();
360   m_ptr_params         = new_heap->localize({m_ptr_params, m_len_params}).data();
361   m_ptr_query          = new_heap->localize({m_ptr_query, m_len_query}).data();
362   m_ptr_fragment       = new_heap->localize({m_ptr_fragment, m_len_fragment}).data();
363   m_ptr_printed_string = new_heap->localize({m_ptr_printed_string, m_len_printed_string}).data();
364 }
365 
366 void
move_strings(HdrStrHeap * new_heap)367 URLImpl::move_strings(HdrStrHeap *new_heap)
368 {
369   HDR_MOVE_STR(m_ptr_scheme, m_len_scheme);
370   HDR_MOVE_STR(m_ptr_user, m_len_user);
371   HDR_MOVE_STR(m_ptr_password, m_len_password);
372   HDR_MOVE_STR(m_ptr_host, m_len_host);
373   HDR_MOVE_STR(m_ptr_port, m_len_port);
374   HDR_MOVE_STR(m_ptr_path, m_len_path);
375   HDR_MOVE_STR(m_ptr_params, m_len_params);
376   HDR_MOVE_STR(m_ptr_query, m_len_query);
377   HDR_MOVE_STR(m_ptr_fragment, m_len_fragment);
378   HDR_MOVE_STR(m_ptr_printed_string, m_len_printed_string);
379 }
380 
381 size_t
strings_length()382 URLImpl::strings_length()
383 {
384   size_t ret = 0;
385 
386   ret += m_len_scheme;
387   ret += m_len_user;
388   ret += m_len_password;
389   ret += m_len_host;
390   ret += m_len_port;
391   ret += m_len_path;
392   ret += m_len_params;
393   ret += m_len_query;
394   ret += m_len_fragment;
395   ret += m_len_printed_string;
396   return ret;
397 }
398 
399 void
check_strings(HeapCheck * heaps,int num_heaps)400 URLImpl::check_strings(HeapCheck *heaps, int num_heaps)
401 {
402   CHECK_STR(m_ptr_scheme, m_len_scheme, heaps, num_heaps);
403   CHECK_STR(m_ptr_user, m_len_user, heaps, num_heaps);
404   CHECK_STR(m_ptr_password, m_len_password, heaps, num_heaps);
405   CHECK_STR(m_ptr_host, m_len_host, heaps, num_heaps);
406   CHECK_STR(m_ptr_port, m_len_port, heaps, num_heaps);
407   CHECK_STR(m_ptr_path, m_len_path, heaps, num_heaps);
408   CHECK_STR(m_ptr_params, m_len_params, heaps, num_heaps);
409   CHECK_STR(m_ptr_query, m_len_query, heaps, num_heaps);
410   CHECK_STR(m_ptr_fragment, m_len_fragment, heaps, num_heaps);
411   //    CHECK_STR(m_ptr_printed_string, m_len_printed_string, heaps, num_heaps);
412 }
413 
414 /***********************************************************************
415  *                                                                     *
416  *                               S E T                                 *
417  *                                                                     *
418  ***********************************************************************/
419 
420 const char *
url_scheme_set(HdrHeap * heap,URLImpl * url,const char * scheme_str,int scheme_wks_idx,int length,bool copy_string)421 url_scheme_set(HdrHeap *heap, URLImpl *url, const char *scheme_str, int scheme_wks_idx, int length, bool copy_string)
422 {
423   const char *scheme_wks;
424   url_called_set(url);
425   if (length == 0) {
426     scheme_str = nullptr;
427   }
428 
429   mime_str_u16_set(heap, scheme_str, length, &(url->m_ptr_scheme), &(url->m_len_scheme), copy_string);
430 
431   url->m_scheme_wks_idx = scheme_wks_idx;
432   if (scheme_wks_idx >= 0) {
433     scheme_wks = hdrtoken_index_to_wks(scheme_wks_idx);
434   } else {
435     scheme_wks = nullptr;
436   }
437 
438   if (scheme_wks == URL_SCHEME_HTTP || scheme_wks == URL_SCHEME_WS) {
439     url->m_url_type = URL_TYPE_HTTP;
440   } else if (scheme_wks == URL_SCHEME_HTTPS || scheme_wks == URL_SCHEME_WSS) {
441     url->m_url_type = URL_TYPE_HTTPS;
442   } else {
443     url->m_url_type = URL_TYPE_HTTP;
444   }
445 
446   return scheme_wks; // tokenized string or NULL if not well known
447 }
448 
449 /*-------------------------------------------------------------------------
450   -------------------------------------------------------------------------*/
451 
452 void
url_user_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)453 url_user_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
454 {
455   url_called_set(url);
456   if (length == 0) {
457     value = nullptr;
458   }
459   mime_str_u16_set(heap, value, length, &(url->m_ptr_user), &(url->m_len_user), copy_string);
460 }
461 
462 /*-------------------------------------------------------------------------
463   -------------------------------------------------------------------------*/
464 
465 void
url_password_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)466 url_password_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
467 {
468   url_called_set(url);
469   if (length == 0) {
470     value = nullptr;
471   }
472   mime_str_u16_set(heap, value, length, &(url->m_ptr_password), &(url->m_len_password), copy_string);
473 }
474 
475 /*-------------------------------------------------------------------------
476   -------------------------------------------------------------------------*/
477 
478 void
url_host_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)479 url_host_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
480 {
481   url_called_set(url);
482   if (length == 0) {
483     value = nullptr;
484   }
485   mime_str_u16_set(heap, value, length, &(url->m_ptr_host), &(url->m_len_host), copy_string);
486 }
487 
488 /*-------------------------------------------------------------------------
489   -------------------------------------------------------------------------*/
490 
491 void
url_port_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)492 url_port_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
493 {
494   url_called_set(url);
495   if (length == 0) {
496     value = nullptr;
497   }
498   mime_str_u16_set(heap, value, length, &(url->m_ptr_port), &(url->m_len_port), copy_string);
499 
500   url->m_port = 0;
501   for (int i = 0; i < length; i++) {
502     if (!ParseRules::is_digit(value[i])) {
503       break;
504     }
505     url->m_port = url->m_port * 10 + (value[i] - '0');
506   }
507 }
508 
509 /*-------------------------------------------------------------------------
510   -------------------------------------------------------------------------*/
511 
512 void
url_port_set(HdrHeap * heap,URLImpl * url,unsigned int port)513 url_port_set(HdrHeap *heap, URLImpl *url, unsigned int port)
514 {
515   url_called_set(url);
516   if (port > 0) {
517     char value[6];
518     int length;
519 
520     length = ink_fast_itoa(port, value, sizeof(value));
521     mime_str_u16_set(heap, value, length, &(url->m_ptr_port), &(url->m_len_port), true);
522   } else {
523     mime_str_u16_set(heap, nullptr, 0, &(url->m_ptr_port), &(url->m_len_port), true);
524   }
525   url->m_port = port;
526 }
527 
528 /*-------------------------------------------------------------------------
529   -------------------------------------------------------------------------*/
530 
531 void
url_path_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)532 url_path_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
533 {
534   url_called_set(url);
535   if (length == 0) {
536     value = nullptr;
537   }
538   mime_str_u16_set(heap, value, length, &(url->m_ptr_path), &(url->m_len_path), copy_string);
539 }
540 
541 /*-------------------------------------------------------------------------
542   -------------------------------------------------------------------------*/
543 
544 // empties params/query/fragment component
545 // url_{params|query|fragment}_set()
546 
547 void
url_params_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)548 url_params_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
549 {
550   url_called_set(url);
551   mime_str_u16_set(heap, value, length, &(url->m_ptr_params), &(url->m_len_params), copy_string);
552 }
553 
554 /*-------------------------------------------------------------------------
555   -------------------------------------------------------------------------*/
556 
557 void
url_query_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)558 url_query_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
559 {
560   url_called_set(url);
561   mime_str_u16_set(heap, value, length, &(url->m_ptr_query), &(url->m_len_query), copy_string);
562 }
563 
564 /*-------------------------------------------------------------------------
565   -------------------------------------------------------------------------*/
566 
567 void
url_fragment_set(HdrHeap * heap,URLImpl * url,const char * value,int length,bool copy_string)568 url_fragment_set(HdrHeap *heap, URLImpl *url, const char *value, int length, bool copy_string)
569 {
570   url_called_set(url);
571   mime_str_u16_set(heap, value, length, &(url->m_ptr_fragment), &(url->m_len_fragment), copy_string);
572 }
573 
574 /*-------------------------------------------------------------------------
575   -------------------------------------------------------------------------*/
576 
577 void
url_type_set(URLImpl * url,unsigned int typecode)578 url_type_set(URLImpl *url, unsigned int typecode)
579 {
580   url_called_set(url);
581   url->m_type_code = typecode;
582 }
583 
584 /*-------------------------------------------------------------------------
585   -------------------------------------------------------------------------*/
586 
587 /***********************************************************************
588  *                                                                     *
589  *                               G E T                                 *
590  *                                                                     *
591  ***********************************************************************/
592 
593 /*-------------------------------------------------------------------------
594   -------------------------------------------------------------------------*/
595 
596 void
url_called_set(URLImpl * url)597 url_called_set(URLImpl *url)
598 {
599   url->m_clean = !url->m_ptr_printed_string;
600 }
601 
602 void
url_clear_string_ref(URLImpl * url)603 url_clear_string_ref(URLImpl *url)
604 {
605   if (url->m_ptr_printed_string) {
606     url->m_len_printed_string = 0;
607     url->m_ptr_printed_string = nullptr;
608     url->m_clean              = true;
609   }
610   return;
611 }
612 
613 char *
url_string_get_ref(HdrHeap * heap,URLImpl * url,int * length,bool normalized)614 url_string_get_ref(HdrHeap *heap, URLImpl *url, int *length, bool normalized)
615 {
616   if (!url) {
617     return nullptr;
618   }
619 
620   if (url->m_ptr_printed_string && url->m_clean) {
621     if (length) {
622       *length = url->m_len_printed_string;
623     }
624     return const_cast<char *>(url->m_ptr_printed_string);
625   } else { // either not clean or never printed
626     int len = url_length_get(url);
627     char *buf;
628     int index  = 0;
629     int offset = 0;
630 
631     /* stuff alloc'd here gets gc'd on HdrHeap::destroy() */
632     buf = heap->allocate_str(len + 1);
633     url_print(url, buf, len, &index, &offset, normalized);
634     buf[len] = '\0';
635 
636     if (length) {
637       *length = len;
638     }
639     url->m_clean              = true; // reset since we have url_print()'ed again
640     url->m_len_printed_string = len;
641     url->m_ptr_printed_string = buf;
642     return buf;
643   }
644 }
645 
646 char *
url_string_get(URLImpl * url,Arena * arena,int * length,HdrHeap * heap,bool normalized)647 url_string_get(URLImpl *url, Arena *arena, int *length, HdrHeap *heap, bool normalized)
648 {
649   int len = url_length_get(url);
650   char *buf;
651   char *buf2;
652   int index  = 0;
653   int offset = 0;
654 
655   buf = arena ? arena->str_alloc(len) : static_cast<char *>(ats_malloc(len + 1));
656 
657   url_print(url, buf, len, &index, &offset, normalized);
658   buf[len] = '\0';
659 
660   /* see string_get_ref() */
661   if (heap) {
662     buf2 = heap->allocate_str(len + 1);
663     memcpy(buf2, buf, len);
664     buf2[len]                 = '\0';
665     url->m_clean              = true; // reset since we have url_print()'ed again
666     url->m_len_printed_string = len;
667     url->m_ptr_printed_string = buf2;
668   }
669 
670   if (length) {
671     *length = len;
672   }
673   return buf;
674 }
675 
676 /*-------------------------------------------------------------------------
677   -------------------------------------------------------------------------*/
678 
679 char *
url_string_get_buf(URLImpl * url,char * dstbuf,int dstbuf_size,int * length)680 url_string_get_buf(URLImpl *url, char *dstbuf, int dstbuf_size, int *length)
681 {
682   int len    = url_length_get(url);
683   int index  = 0;
684   int offset = 0;
685   char *buf  = nullptr;
686 
687   if (dstbuf && dstbuf_size > 0) {
688     buf = dstbuf;
689     if (len >= dstbuf_size) {
690       len = dstbuf_size - 1;
691     }
692     url_print(url, dstbuf, len, &index, &offset);
693     buf[len] = 0;
694 
695     if (length) {
696       *length = len;
697     }
698   }
699   return buf;
700 }
701 
702 /*-------------------------------------------------------------------------
703   -------------------------------------------------------------------------*/
704 
705 const char *
url_user_get(URLImpl * url,int * length)706 url_user_get(URLImpl *url, int *length)
707 {
708   *length = url->m_len_user;
709   return url->m_ptr_user;
710 }
711 
712 /*-------------------------------------------------------------------------
713   -------------------------------------------------------------------------*/
714 
715 const char *
url_password_get(URLImpl * url,int * length)716 url_password_get(URLImpl *url, int *length)
717 {
718   *length = url->m_len_password;
719   return url->m_ptr_password;
720 }
721 
722 /*-------------------------------------------------------------------------
723   -------------------------------------------------------------------------*/
724 
725 const char *
url_host_get(URLImpl * url,int * length)726 url_host_get(URLImpl *url, int *length)
727 {
728   *length = url->m_len_host;
729   return url->m_ptr_host;
730 }
731 
732 /*-------------------------------------------------------------------------
733   -------------------------------------------------------------------------*/
734 
735 int
url_port_get(URLImpl * url)736 url_port_get(URLImpl *url)
737 {
738   return url->m_port;
739 }
740 
741 /*-------------------------------------------------------------------------
742   -------------------------------------------------------------------------*/
743 
744 const char *
url_path_get(URLImpl * url,int * length)745 url_path_get(URLImpl *url, int *length)
746 {
747   *length = url->m_len_path;
748   return url->m_ptr_path;
749 }
750 
751 /*-------------------------------------------------------------------------
752   -------------------------------------------------------------------------*/
753 
754 const char *
url_params_get(URLImpl * url,int * length)755 url_params_get(URLImpl *url, int *length)
756 {
757   *length = url->m_len_params;
758   return url->m_ptr_params;
759 }
760 
761 /*-------------------------------------------------------------------------
762   -------------------------------------------------------------------------*/
763 
764 const char *
url_query_get(URLImpl * url,int * length)765 url_query_get(URLImpl *url, int *length)
766 {
767   *length = url->m_len_query;
768   return url->m_ptr_query;
769 }
770 
771 /*-------------------------------------------------------------------------
772   -------------------------------------------------------------------------*/
773 
774 const char *
url_fragment_get(URLImpl * url,int * length)775 url_fragment_get(URLImpl *url, int *length)
776 {
777   *length = url->m_len_fragment;
778   return url->m_ptr_fragment;
779 }
780 
781 /*-------------------------------------------------------------------------
782   -------------------------------------------------------------------------*/
783 
784 int
url_type_get(URLImpl * url)785 url_type_get(URLImpl *url)
786 {
787   return url->m_type_code;
788 }
789 
790 /*-------------------------------------------------------------------------
791   -------------------------------------------------------------------------*/
792 
793 /***********************************************************************
794  *                                                                     *
795  *               U R L    S T R I N G    F U N C T I O N S             *
796  *                                                                     *
797  ***********************************************************************/
798 
799 /*-------------------------------------------------------------------------
800   -------------------------------------------------------------------------*/
801 
802 int
url_length_get(URLImpl * url)803 url_length_get(URLImpl *url)
804 {
805   int length = 0;
806 
807   if (url->m_ptr_scheme) {
808     if ((url->m_scheme_wks_idx >= 0) && (hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) {
809       length += url->m_len_scheme + 1; // +1 for ":"
810     } else {
811       length += url->m_len_scheme + 3; // +3 for "://"
812     }
813   }
814 
815   if (url->m_ptr_user) {
816     length += url->m_len_user + 1; // +1 for "@"
817     if (url->m_ptr_password) {
818       length += url->m_len_password + 1; // +1 for ":"
819     }
820   }
821 
822   if (url->m_ptr_host) {
823     length += url->m_len_host;
824     if (url->m_ptr_port && url->m_port) {
825       length += url->m_len_port + 1; // +1 for ":"
826     }
827   }
828 
829   if (url->m_ptr_path) {
830     length += url->m_len_path + 1; // +1 for /
831   } else {
832     length += 1; // +1 for /
833   }
834 
835   if (url->m_ptr_params && url->m_len_params > 0) {
836     length += url->m_len_params + 1; // +1 for ";"
837   }
838 
839   if (url->m_ptr_query && url->m_len_query > 0) {
840     length += url->m_len_query + 1; // +1 for "?"
841   }
842 
843   if (url->m_ptr_fragment && url->m_len_fragment > 0) {
844     length += url->m_len_fragment + 1; // +1 for "#"
845   }
846 
847   return length;
848 }
849 
850 /*-------------------------------------------------------------------------
851   -------------------------------------------------------------------------*/
852 
853 char *
url_to_string(URLImpl * url,Arena * arena,int * length)854 url_to_string(URLImpl *url, Arena *arena, int *length)
855 {
856   int len;
857   int idx;
858   char *str;
859 
860   len = url_length_get(url) + 1;
861 
862   if (length) {
863     *length = len;
864   }
865 
866   if (arena) {
867     str = arena->str_alloc(len);
868   } else {
869     str = static_cast<char *>(ats_malloc(len + 1));
870   }
871 
872   idx = 0;
873 
874   if (url->m_ptr_scheme) {
875     memcpy(&str[idx], url->m_ptr_scheme, url->m_len_scheme);
876     idx += url->m_len_scheme;
877     if ((url->m_scheme_wks_idx >= 0) && (hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) {
878       str[idx++] = ':';
879     } else {
880       str[idx++] = ':';
881       str[idx++] = '/';
882       str[idx++] = '/';
883     }
884   }
885 
886   if (url->m_ptr_user) {
887     memcpy(&str[idx], url->m_ptr_user, url->m_len_user);
888     idx += url->m_len_user;
889     if (url->m_ptr_password) {
890       str[idx++] = ':';
891       memcpy(&str[idx], url->m_ptr_password, url->m_len_password);
892       idx += url->m_len_password;
893     }
894     str[idx++] = '@';
895   }
896 
897   if (url->m_ptr_host) {
898     memcpy(&str[idx], url->m_ptr_host, url->m_len_host);
899     idx += url->m_len_host;
900     if (url->m_ptr_port != nullptr) {
901       str[idx++] = ':';
902       memcpy(&str[idx], url->m_ptr_port, url->m_len_port);
903       idx += url->m_len_port;
904     }
905   }
906 
907   memcpy(&str[idx], url->m_ptr_path, url->m_len_path);
908   idx += url->m_len_path;
909 
910   if (url->m_ptr_params && url->m_len_params > 0) {
911     str[idx++] = ';';
912     memcpy(&str[idx], url->m_ptr_params, url->m_len_params);
913     idx += url->m_len_params;
914   }
915 
916   if (url->m_ptr_query && url->m_len_query > 0) {
917     str[idx++] = '?';
918     memcpy(&str[idx], url->m_ptr_query, url->m_len_query);
919     idx += url->m_len_query;
920   }
921 
922   if (url->m_ptr_fragment && url->m_len_fragment > 0) {
923     str[idx++] = '#';
924     memcpy(&str[idx], url->m_ptr_fragment, url->m_len_fragment);
925     idx += url->m_len_fragment;
926   }
927 
928   str[idx++] = '\0';
929 
930   ink_release_assert(idx == len);
931 
932   return str;
933 }
934 
935 /*-------------------------------------------------------------------------
936   -------------------------------------------------------------------------*/
937 
938 /***********************************************************************
939  *                                                                     *
940  *                     E S C A P E - H A N D L I N G                   *
941  *                                                                     *
942  ***********************************************************************/
943 
944 void
unescape_str(char * & buf,char * buf_e,const char * & str,const char * str_e,int & state)945 unescape_str(char *&buf, char *buf_e, const char *&str, const char *str_e, int &state)
946 {
947   int copy_len;
948   char *first_pct;
949   int buf_len = static_cast<int>(buf_e - buf);
950   int str_len = static_cast<int>(str_e - str);
951   int min_len = (str_len < buf_len ? str_len : buf_len);
952 
953   first_pct = ink_memcpy_until_char(buf, const_cast<char *>(str), min_len, '%');
954   copy_len  = static_cast<int>(first_pct - str);
955   str += copy_len;
956   buf += copy_len;
957   if (copy_len == min_len) {
958     return;
959   }
960 
961   while (str < str_e && (buf != buf_e)) {
962     switch (state) {
963     case 0:
964       if (str[0] == '%') {
965         str += 1;
966         state = 1;
967       } else {
968         *buf++ = str[0];
969         str += 1;
970       }
971       break;
972     case 1:
973       if (ParseRules::is_hex(str[0])) {
974         str += 1;
975         state = 2;
976       } else {
977         *buf++ = str[-1];
978         state  = 0;
979       }
980       break;
981     case 2:
982       if (ParseRules::is_hex(str[0])) {
983         int tmp;
984 
985         if (ParseRules::is_alpha(str[-1])) {
986           tmp = (ParseRules::ink_toupper(str[-1]) - 'A' + 10) * 16;
987         } else {
988           tmp = (str[-1] - '0') * 16;
989         }
990         if (ParseRules::is_alpha(str[0])) {
991           tmp += (ParseRules::ink_toupper(str[0]) - 'A' + 10);
992         } else {
993           tmp += str[0] - '0';
994         }
995 
996         *buf++ = tmp;
997         str += 1;
998         state = 0;
999       } else {
1000         *buf++ = str[-2];
1001         state  = 3;
1002       }
1003       break;
1004     case 3:
1005       *buf++ = str[-1];
1006       state  = 0;
1007       break;
1008     }
1009   }
1010 }
1011 
1012 /*-------------------------------------------------------------------------
1013   -------------------------------------------------------------------------*/
1014 
1015 void
unescape_str_tolower(char * & buf,char * end,const char * & str,const char * str_e,int & state)1016 unescape_str_tolower(char *&buf, char *end, const char *&str, const char *str_e, int &state)
1017 {
1018   while (str < str_e && (buf != end)) {
1019     switch (state) {
1020     case 0:
1021       if (str[0] == '%') {
1022         str += 1;
1023         state = 1;
1024       } else {
1025         *buf++ = ParseRules::ink_tolower(str[0]);
1026         str += 1;
1027       }
1028       break;
1029     case 1:
1030       if (ParseRules::is_hex(str[0])) {
1031         str += 1;
1032         state = 2;
1033       } else {
1034         *buf++ = ParseRules::ink_tolower(str[-1]);
1035         state  = 0;
1036       }
1037       break;
1038     case 2:
1039       if (ParseRules::is_hex(str[0])) {
1040         int tmp;
1041 
1042         if (ParseRules::is_alpha(str[-1])) {
1043           tmp = (ParseRules::ink_toupper(str[-1]) - 'A' + 10) * 16;
1044         } else {
1045           tmp = (str[-1] - '0') * 16;
1046         }
1047         if (ParseRules::is_alpha(str[0])) {
1048           tmp += (ParseRules::ink_toupper(str[0]) - 'A' + 10);
1049         } else {
1050           tmp += str[0] - '0';
1051         }
1052 
1053         *buf++ = tmp;
1054         str += 1;
1055         state = 0;
1056       } else {
1057         *buf++ = ParseRules::ink_tolower(str[-2]);
1058         state  = 3;
1059       }
1060       break;
1061     case 3:
1062       *buf++ = ParseRules::ink_tolower(str[-1]);
1063       state  = 0;
1064       break;
1065     }
1066   }
1067 }
1068 
1069 /*-------------------------------------------------------------------------
1070   -------------------------------------------------------------------------*/
1071 
1072 char *
url_unescapify(Arena * arena,const char * str,int length)1073 url_unescapify(Arena *arena, const char *str, int length)
1074 {
1075   char *buffer;
1076   char *t, *e;
1077   int s;
1078 
1079   if (length == -1) {
1080     length = static_cast<int>(strlen(str));
1081   }
1082 
1083   buffer = arena->str_alloc(length);
1084   t      = buffer;
1085   e      = buffer + length;
1086   s      = 0;
1087 
1088   unescape_str(t, e, str, str + length, s);
1089   *t = '\0';
1090 
1091   return buffer;
1092 }
1093 
1094 /*-------------------------------------------------------------------------
1095   -------------------------------------------------------------------------*/
1096 
1097 /***********************************************************************
1098  *                                                                     *
1099  *                            P A R S I N G                            *
1100  *                                                                     *
1101  ***********************************************************************/
1102 
// Advance the local cursor `cur` by one character and jump to @a label once
// it reaches or passes the local `end`. Both names must be in scope where
// the macro is used.
#define GETNEXT(label)  \
  {                     \
    if (++cur >= end) { \
      goto label;       \
    }                   \
  }
1110 
1111 ParseResult
url_parse_scheme(HdrHeap * heap,URLImpl * url,const char ** start,const char * end,bool copy_strings_p)1112 url_parse_scheme(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p)
1113 {
1114   const char *cur = *start;
1115   const char *scheme_wks;
1116   const char *scheme_start = nullptr;
1117   const char *scheme_end   = nullptr;
1118   int scheme_wks_idx;
1119 
1120   while (' ' == *cur && ++cur < end) {
1121     ;
1122   }
1123   if (cur < end) {
1124     scheme_start = scheme_end = cur;
1125     // special case 'http:' for performance
1126     if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) {
1127       scheme_end = cur + 4; // point to colon
1128       url_scheme_set(heap, url, scheme_start, URL_WKSIDX_HTTP, 4, copy_strings_p);
1129     } else if ('/' != *cur) {
1130       // For forward transparent mode, the URL for the method can just be a path,
1131       // so don't scan that for a scheme, as we could find a false positive if there
1132       // is a URL in the parameters (which is legal).
1133       while (':' != *cur && ++cur < end) {
1134         ;
1135       }
1136       if (cur < end) { // found a colon
1137         scheme_wks_idx = hdrtoken_tokenize(scheme_start, cur - scheme_start, &scheme_wks);
1138 
1139         /*  Distinguish between a scheme only and a username by looking past the colon. If it is missing
1140             or it's a slash, presume scheme. Otherwise it's a username with a password.
1141         */
1142         if ((scheme_wks_idx > 0 && hdrtoken_wks_to_token_type(scheme_wks) == HDRTOKEN_TYPE_SCHEME) || // known scheme
1143             (cur >= end - 1 || cur[1] == '/')) // no more data or slash past colon
1144         {
1145           scheme_end = cur;
1146           url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings_p);
1147         }
1148       }
1149     }
1150     *start = scheme_end;
1151     return PARSE_RESULT_CONT;
1152   }
1153   return PARSE_RESULT_ERROR; // no non-whitespace found
1154 }
1155 
1156 /**
1157  *  This method will return TRUE if the uri is strictly compliant with
1158  *  RFC 3986 and it will return FALSE if not.
1159  */
1160 static bool
url_is_strictly_compliant(const char * start,const char * end)1161 url_is_strictly_compliant(const char *start, const char *end)
1162 {
1163   for (const char *i = start; i < end; ++i) {
1164     if (!ParseRules::is_uri(*i)) {
1165       Debug("http", "Non-RFC compliant character [0x%.2X] found in URL", (unsigned char)*i);
1166       return false;
1167     }
1168   }
1169   return true;
1170 }
1171 
1172 ParseResult
url_parse(HdrHeap * heap,URLImpl * url,const char ** start,const char * end,bool copy_strings_p,bool strict_uri_parsing)1173 url_parse(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p, bool strict_uri_parsing)
1174 {
1175   if (strict_uri_parsing && !url_is_strictly_compliant(*start, end)) {
1176     return PARSE_RESULT_ERROR;
1177   }
1178 
1179   ParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p);
1180   return PARSE_RESULT_CONT == zret ? url_parse_http(heap, url, start, end, copy_strings_p) : zret;
1181 }
1182 
1183 ParseResult
url_parse_no_path_component_breakdown(HdrHeap * heap,URLImpl * url,const char ** start,const char * end,bool copy_strings_p)1184 url_parse_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings_p)
1185 {
1186   ParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p);
1187   return PARSE_RESULT_CONT == zret ? url_parse_http_no_path_component_breakdown(heap, url, start, end, copy_strings_p) : zret;
1188 }
1189 
1190 /**
1191   Parse internet URL.
1192 
1193   @verbatim
1194   [://][user[:password]@]host[:port]
1195 
1196   some.place/
1197   some.place:80/
1198   foo@some.place:80/
1199   foo:bar@some.place:80/
1200   foo:bar@some.place/
1201   foo:42@some.place/
1202   @endverbatim
1203 
1204 */
1205 
1206 ParseResult
url_parse_internet(HdrHeap * heap,URLImpl * url,const char ** start,char const * end,bool copy_strings_p)1207 url_parse_internet(HdrHeap *heap, URLImpl *url, const char **start, char const *end, bool copy_strings_p)
1208 {
1209   const char *cur = *start;
1210   const char *base;              // Base for host/port field.
1211   const char *bracket = nullptr; // marker for open bracket, if any.
1212   ts::ConstBuffer user, passw, host, port;
1213   static size_t const MAX_COLON = 8; // max # of valid colons.
1214   size_t n_colon                = 0;
1215   const char *last_colon        = nullptr; // pointer to last colon seen.
1216 
1217   // Do a quick check for "://"
1218   if (end - cur > 3 && (((':' ^ *cur) | ('/' ^ cur[1]) | ('/' ^ cur[2])) == 0)) {
1219     cur += 3;
1220   } else if (':' == *cur && (++cur >= end || ('/' == *cur && (++cur >= end || ('/' == *cur && ++cur >= end))))) {
1221     return PARSE_RESULT_ERROR;
1222   }
1223 
1224   base = cur;
1225   // skipped leading stuff, start real parsing.
1226   while (cur < end) {
1227     // Note: Each case is responsible for incrementing @a cur if
1228     // appropriate!
1229     switch (*cur) {
1230     case ']': // address close
1231       if (nullptr == bracket || n_colon >= MAX_COLON) {
1232         return PARSE_RESULT_ERROR;
1233       }
1234       ++cur;
1235       /* We keep the brackets because there are too many other places
1236          that depend on them and it's too painful to keep track if
1237          they should be used. I thought about being clever with
1238          stripping brackets from non-IPv6 content but that gets ugly
1239          as well. Just not worth it.
1240        */
1241       host.set(bracket, cur);
1242       // Spec requires This constitute the entire host so the next
1243       // character must be missing (EOS), slash, or colon.
1244       if (cur >= end || '/' == *cur) { // done which is OK
1245         last_colon = nullptr;
1246         break;
1247       } else if (':' != *cur) { // otherwise it must be a colon
1248         return PARSE_RESULT_ERROR;
1249       }
1250       /* We want to prevent more than 1 colon following so we set @a
1251          n_colon appropriately.
1252       */
1253       n_colon = MAX_COLON - 1;
1254     // FALL THROUGH
1255     case ':': // track colons, fail if too many.
1256       if (++n_colon > MAX_COLON) {
1257         return PARSE_RESULT_ERROR;
1258       }
1259       last_colon = cur;
1260       ++cur;
1261       break;
1262     case '@': // user/password marker.
1263       if (user || n_colon > 1) {
1264         return PARSE_RESULT_ERROR; // we already got one, or too many colons.
1265       }
1266       if (n_colon) {
1267         user.set(base, last_colon);
1268         passw.set(last_colon + 1, cur);
1269         n_colon    = 0;
1270         last_colon = nullptr;
1271       } else {
1272         user.set(base, cur);
1273       }
1274       ++cur;
1275       base = cur;
1276       break;
1277     case '[':                       // address open
1278       if (bracket || base != cur) { // must be first char in field
1279         return PARSE_RESULT_ERROR;
1280       }
1281       bracket = cur; // location and flag.
1282       ++cur;
1283       break;
1284     case '/':    // we're done with this phase.
1285       end = cur; // cause loop exit
1286       break;
1287     default:
1288       ++cur;
1289       break;
1290     };
1291   }
1292   // Time to pick up the pieces. At this pointer cur._ptr is the first
1293   // character past the parse area.
1294 
1295   if (user) {
1296     url_user_set(heap, url, user._ptr, user._size, copy_strings_p);
1297     if (passw) {
1298       url_password_set(heap, url, passw._ptr, passw._size, copy_strings_p);
1299     }
1300   }
1301 
1302   // @a host not set means no brackets to mark explicit host.
1303   if (!host) {
1304     if (1 == n_colon || MAX_COLON == n_colon) { // presume port.
1305       host.set(base, last_colon);
1306     } else { // it's all host.
1307       host.set(base, cur);
1308       last_colon = nullptr; // prevent port setting.
1309     }
1310   }
1311   if (host._size) {
1312     if (validate_host_name(std::string_view(host._ptr, host._size))) {
1313       url_host_set(heap, url, host._ptr, host._size, copy_strings_p);
1314     } else {
1315       return PARSE_RESULT_ERROR;
1316     }
1317   }
1318 
1319   if (last_colon) {
1320     ink_assert(n_colon);
1321     port.set(last_colon + 1, cur);
1322     if (!port._size) {
1323       return PARSE_RESULT_ERROR; // colon w/o port value.
1324     }
1325     url_port_set(heap, url, port._ptr, port._size, copy_strings_p);
1326   }
1327   if ('/' == *cur) {
1328     ++cur; // must do this after filling in host/port.
1329   }
1330   *start = cur;
1331   return PARSE_RESULT_DONE;
1332 }
1333 
1334 /*-------------------------------------------------------------------------
1335   -------------------------------------------------------------------------*/
1336 
1337 // empties params/query/fragment component
1338 
1339 ParseResult
url_parse_http(HdrHeap * heap,URLImpl * url,const char ** start,const char * end,bool copy_strings)1340 url_parse_http(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings)
1341 {
1342   ParseResult err;
1343   const char *cur;
1344   const char *path_start     = nullptr;
1345   const char *path_end       = nullptr;
1346   const char *params_start   = nullptr;
1347   const char *params_end     = nullptr;
1348   const char *query_start    = nullptr;
1349   const char *query_end      = nullptr;
1350   const char *fragment_start = nullptr;
1351   const char *fragment_end   = nullptr;
1352   char mask;
1353 
1354   err = url_parse_internet(heap, url, start, end, copy_strings);
1355   if (err < 0) {
1356     return err;
1357   }
1358 
1359   cur = *start;
1360   if (*start == end) {
1361     goto done;
1362   }
1363 
1364   path_start = cur;
1365   mask       = ';' & '?' & '#';
1366 parse_path2:
1367   if ((*cur & mask) == mask) {
1368     if (*cur == ';') {
1369       path_end = cur;
1370       goto parse_params1;
1371     }
1372     if (*cur == '?') {
1373       path_end = cur;
1374       goto parse_query1;
1375     }
1376     if (*cur == '#') {
1377       path_end = cur;
1378       goto parse_fragment1;
1379     }
1380   } else {
1381     ink_assert((*cur != ';') && (*cur != '?') && (*cur != '#'));
1382   }
1383   GETNEXT(done);
1384   goto parse_path2;
1385 
1386 parse_params1:
1387   params_start = cur + 1;
1388   GETNEXT(done);
1389 parse_params2:
1390   if (*cur == '?') {
1391     params_end = cur;
1392     goto parse_query1;
1393   }
1394   if (*cur == '#') {
1395     params_end = cur;
1396     goto parse_fragment1;
1397   }
1398   GETNEXT(done);
1399   goto parse_params2;
1400 
1401 parse_query1:
1402   query_start = cur + 1;
1403   GETNEXT(done);
1404 parse_query2:
1405   if (*cur == '#') {
1406     query_end = cur;
1407     goto parse_fragment1;
1408   }
1409   GETNEXT(done);
1410   goto parse_query2;
1411 
1412 parse_fragment1:
1413   fragment_start = cur + 1;
1414   GETNEXT(done);
1415   fragment_end = end;
1416 
1417 done:
1418   if (path_start) {
1419     if (!path_end) {
1420       path_end = cur;
1421     }
1422     url_path_set(heap, url, path_start, path_end - path_start, copy_strings);
1423   }
1424   if (params_start) {
1425     if (!params_end) {
1426       params_end = cur;
1427     }
1428     url_params_set(heap, url, params_start, params_end - params_start, copy_strings);
1429   }
1430   if (query_start) {
1431     if (!query_end) {
1432       query_end = cur;
1433     }
1434     url_query_set(heap, url, query_start, query_end - query_start, copy_strings);
1435   }
1436   if (fragment_start) {
1437     if (!fragment_end) {
1438       fragment_end = cur;
1439     }
1440     url_fragment_set(heap, url, fragment_start, fragment_end - fragment_start, copy_strings);
1441   }
1442 
1443   *start = cur;
1444   return PARSE_RESULT_DONE;
1445 }
1446 
1447 ParseResult
url_parse_http_no_path_component_breakdown(HdrHeap * heap,URLImpl * url,const char ** start,const char * end,bool copy_strings)1448 url_parse_http_no_path_component_breakdown(HdrHeap *heap, URLImpl *url, const char **start, const char *end, bool copy_strings)
1449 {
1450   const char *cur = *start;
1451   const char *host_end;
1452 
1453   // Do a quick check for "://" - our only format check.
1454   if (end - cur > 3 && (((':' ^ *cur) | ('/' ^ cur[1]) | ('/' ^ cur[2])) == 0)) {
1455     cur += 3;
1456   } else if (':' == *cur && (++cur >= end || ('/' == *cur && (++cur >= end || ('/' == *cur && ++cur >= end))))) {
1457     return PARSE_RESULT_ERROR;
1458   }
1459 
1460   // Grab everything until EOS or slash.
1461   const char *base = cur;
1462   cur              = static_cast<const char *>(memchr(cur, '/', end - cur));
1463   if (cur) {
1464     host_end = cur;
1465     ++cur;
1466   } else {
1467     host_end = cur = end;
1468   }
1469 
1470   // Did we find something for the host?
1471   if (base != host_end) {
1472     const char *port = nullptr;
1473     int port_len     = 0;
1474 
1475     // Check for port. Search from the end stopping on the first non-digit
1476     // or more than 5 digits and a delimiter.
1477     port                   = host_end - 1;
1478     const char *port_limit = host_end - 6;
1479     if (port_limit < base) {
1480       port_limit = base; // don't go past start.
1481     }
1482 
1483     while (port >= port_limit && isdigit(*port)) {
1484       --port;
1485     }
1486 
1487     // A port if we're still in the host area and we found a ':' as
1488     // the immediately preceeding character.
1489     if (port >= base && ':' == *port) {
1490       port_len = host_end - port - 1; // must compute this first.
1491       host_end = port;                // then point at colon.
1492       ++port;                         // drop colon from port.
1493       url_port_set(heap, url, port, port_len, copy_strings);
1494     }
1495 
1496     // Now we can set the host.
1497     url_host_set(heap, url, base, host_end - base, copy_strings);
1498   }
1499 
1500   // path is anything that's left.
1501   if (cur < end) {
1502     url_path_set(heap, url, cur, end - cur, copy_strings);
1503     cur = end;
1504   }
1505   *start = cur;
1506   return PARSE_RESULT_DONE;
1507 }
1508 
1509 /*-------------------------------------------------------------------------
1510   -------------------------------------------------------------------------*/
1511 
1512 /***********************************************************************
1513  *                                                                     *
1514  *                           P R I N T I N G                           *
1515  *                                                                     *
1516  ***********************************************************************/
1517 
1518 int
url_print(URLImpl * url,char * buf_start,int buf_length,int * buf_index_inout,int * buf_chars_to_skip_inout,bool normalize)1519 url_print(URLImpl *url, char *buf_start, int buf_length, int *buf_index_inout, int *buf_chars_to_skip_inout, bool normalize)
1520 {
1521 #define TRY(x) \
1522   if (!x)      \
1523   return 0
1524 
1525   if (url->m_ptr_scheme) {
1526     TRY((normalize ? mime_mem_print_lc : mime_mem_print)(url->m_ptr_scheme, url->m_len_scheme, buf_start, buf_length,
1527                                                          buf_index_inout, buf_chars_to_skip_inout));
1528     TRY(mime_mem_print("://", 3, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1529   }
1530 
1531   if (url->m_ptr_user) {
1532     TRY(mime_mem_print(url->m_ptr_user, url->m_len_user, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1533     if (url->m_ptr_password) {
1534       TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1535       TRY(
1536         mime_mem_print(url->m_ptr_password, url->m_len_password, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1537     }
1538     TRY(mime_mem_print("@", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1539   }
1540 
1541   if (url->m_ptr_host) {
1542     // Force brackets for IPv6. Note colon must occur in first 5 characters.
1543     // But it can be less (e.g. "::1").
1544     int n          = url->m_len_host;
1545     bool bracket_p = '[' != *url->m_ptr_host && (nullptr != memchr(url->m_ptr_host, ':', n > 5 ? 5 : n));
1546     if (bracket_p) {
1547       TRY(mime_mem_print("[", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1548     }
1549     TRY((normalize ? mime_mem_print_lc : mime_mem_print)(url->m_ptr_host, url->m_len_host, buf_start, buf_length, buf_index_inout,
1550                                                          buf_chars_to_skip_inout));
1551     if (bracket_p) {
1552       TRY(mime_mem_print("]", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1553     }
1554     if (url->m_ptr_port && url->m_port) {
1555       TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1556       TRY(mime_mem_print(url->m_ptr_port, url->m_len_port, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1557     }
1558   }
1559 
1560   TRY(mime_mem_print("/", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1561 
1562   if (url->m_ptr_path) {
1563     TRY(mime_mem_print(url->m_ptr_path, url->m_len_path, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1564   }
1565 
1566   if (url->m_ptr_params && url->m_len_params > 0) {
1567     TRY(mime_mem_print(";", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1568     TRY(mime_mem_print(url->m_ptr_params, url->m_len_params, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1569   }
1570 
1571   if (url->m_ptr_query && url->m_len_query > 0) {
1572     TRY(mime_mem_print("?", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1573     TRY(mime_mem_print(url->m_ptr_query, url->m_len_query, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1574   }
1575 
1576   if (url->m_ptr_fragment && url->m_len_fragment > 0) {
1577     TRY(mime_mem_print("#", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1578     TRY(mime_mem_print(url->m_ptr_fragment, url->m_len_fragment, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout));
1579   }
1580 
1581   return 1;
1582 
1583 #undef TRY
1584 }
1585 
1586 void
url_describe(HdrHeapObjImpl * raw,bool)1587 url_describe(HdrHeapObjImpl *raw, bool /* recurse ATS_UNUSED */)
1588 {
1589   URLImpl *obj = (URLImpl *)raw;
1590 
1591   Debug("http", "[URLTYPE: %d, SWKSIDX: %d,", obj->m_url_type, obj->m_scheme_wks_idx);
1592   Debug("http", "\tSCHEME: \"%.*s\", SCHEME_LEN: %d,", obj->m_len_scheme, (obj->m_ptr_scheme ? obj->m_ptr_scheme : "NULL"),
1593         obj->m_len_scheme);
1594   Debug("http", "\tUSER: \"%.*s\", USER_LEN: %d,", obj->m_len_user, (obj->m_ptr_user ? obj->m_ptr_user : "NULL"), obj->m_len_user);
1595   Debug("http", "\tPASSWORD: \"%.*s\", PASSWORD_LEN: %d,", obj->m_len_password,
1596         (obj->m_ptr_password ? obj->m_ptr_password : "NULL"), obj->m_len_password);
1597   Debug("http", "\tHOST: \"%.*s\", HOST_LEN: %d,", obj->m_len_host, (obj->m_ptr_host ? obj->m_ptr_host : "NULL"), obj->m_len_host);
1598   Debug("http", "\tPORT: \"%.*s\", PORT_LEN: %d, PORT_NUM: %d", obj->m_len_port, (obj->m_ptr_port ? obj->m_ptr_port : "NULL"),
1599         obj->m_len_port, obj->m_port);
1600   Debug("http", "\tPATH: \"%.*s\", PATH_LEN: %d,", obj->m_len_path, (obj->m_ptr_path ? obj->m_ptr_path : "NULL"), obj->m_len_path);
1601   Debug("http", "\tPARAMS: \"%.*s\", PARAMS_LEN: %d,", obj->m_len_params, (obj->m_ptr_params ? obj->m_ptr_params : "NULL"),
1602         obj->m_len_params);
1603   Debug("http", "\tQUERY: \"%.*s\", QUERY_LEN: %d,", obj->m_len_query, (obj->m_ptr_query ? obj->m_ptr_query : "NULL"),
1604         obj->m_len_query);
1605   Debug("http", "\tFRAGMENT: \"%.*s\", FRAGMENT_LEN: %d]", obj->m_len_fragment,
1606         (obj->m_ptr_fragment ? obj->m_ptr_fragment : "NULL"), obj->m_len_fragment);
1607 }
1608 
1609 /*-------------------------------------------------------------------------
1610   -------------------------------------------------------------------------*/
1611 
1612 /***********************************************************************
1613  *                                                                     *
1614  *                        U R L    D I G E S T S                       *
1615  *                                                                     *
1616  ***********************************************************************/
1617 
1618 static inline void
memcpy_tolower(char * d,const char * s,int n)1619 memcpy_tolower(char *d, const char *s, int n)
1620 {
1621   while (n--) {
1622     *d = ParseRules::ink_tolower(*s);
1623     s++;
1624     d++;
1625   }
1626 }
1627 
// Size in bytes of the on-stack scratch buffers declared by the URL hashing helpers below.
#define BUFSIZE 512
1629 
1630 // fast path for CryptoHash, HTTP, no user/password/params/query,
1631 // no buffer overflow, no unescaping needed
1632 
1633 static inline void
url_CryptoHash_get_fast(const URLImpl * url,CryptoContext & ctx,CryptoHash * hash,cache_generation_t generation)1634 url_CryptoHash_get_fast(const URLImpl *url, CryptoContext &ctx, CryptoHash *hash, cache_generation_t generation)
1635 {
1636   char buffer[BUFSIZE];
1637   char *p;
1638 
1639   p = buffer;
1640   memcpy_tolower(p, url->m_ptr_scheme, url->m_len_scheme);
1641   p += url->m_len_scheme;
1642   *p++ = ':';
1643   *p++ = '/';
1644   *p++ = '/';
1645   // no user
1646   *p++ = ':';
1647   // no password
1648   *p++ = '@';
1649   memcpy_tolower(p, url->m_ptr_host, url->m_len_host);
1650   p += url->m_len_host;
1651   *p++ = '/';
1652   memcpy(p, url->m_ptr_path, url->m_len_path);
1653   p += url->m_len_path;
1654   *p++ = ';';
1655   // no params
1656   *p++ = '?';
1657   // no query
1658 
1659   ink_assert(sizeof(url->m_port) == 2);
1660   uint16_t port = static_cast<uint16_t>(url_canonicalize_port(url->m_url_type, url->m_port));
1661   *p++          = (reinterpret_cast<char *>(&port))[0];
1662   *p++          = (reinterpret_cast<char *>(&port))[1];
1663 
1664   ctx.update(buffer, p - buffer);
1665   if (generation != -1) {
1666     ctx.update(&generation, sizeof(generation));
1667   }
1668 
1669   ctx.finalize(*hash);
1670 }
1671 
1672 static inline void
url_CryptoHash_get_general(const URLImpl * url,CryptoContext & ctx,CryptoHash & hash,cache_generation_t generation)1673 url_CryptoHash_get_general(const URLImpl *url, CryptoContext &ctx, CryptoHash &hash, cache_generation_t generation)
1674 {
1675   char buffer[BUFSIZE];
1676   char *p, *e;
1677   const char *strs[13], *ends[13];
1678   const char *t;
1679   in_port_t port;
1680   int i, s;
1681 
1682   strs[0] = url->m_ptr_scheme;
1683   strs[1] = "://";
1684   strs[2] = url->m_ptr_user;
1685   strs[3] = ":";
1686   strs[4] = url->m_ptr_password;
1687   strs[5] = "@";
1688   strs[6] = url->m_ptr_host;
1689   strs[7] = "/";
1690   strs[8] = url->m_ptr_path;
1691 
1692   ends[0] = strs[0] + url->m_len_scheme;
1693   ends[1] = strs[1] + 3;
1694   ends[2] = strs[2] + url->m_len_user;
1695   ends[3] = strs[3] + 1;
1696   ends[4] = strs[4] + url->m_len_password;
1697   ends[5] = strs[5] + 1;
1698   ends[6] = strs[6] + url->m_len_host;
1699   ends[7] = strs[7] + 1;
1700   ends[8] = strs[8] + url->m_len_path;
1701 
1702   strs[9]  = ";";
1703   strs[10] = url->m_ptr_params;
1704   strs[11] = "?";
1705   strs[12] = url->m_ptr_query;
1706   ends[9]  = strs[9] + 1;
1707   ends[10] = strs[10] + url->m_len_params;
1708   ends[11] = strs[11] + 1;
1709   ends[12] = strs[12] + url->m_len_query;
1710 
1711   p = buffer;
1712   e = buffer + BUFSIZE;
1713 
1714   for (i = 0; i < 13; i++) {
1715     if (strs[i]) {
1716       t = strs[i];
1717       s = 0;
1718 
1719       while (t < ends[i]) {
1720         if ((i == 0) || (i == 6)) { // scheme and host
1721           unescape_str_tolower(p, e, t, ends[i], s);
1722         } else {
1723           unescape_str(p, e, t, ends[i], s);
1724         }
1725 
1726         if (p == e) {
1727           ctx.update(buffer, BUFSIZE);
1728           p = buffer;
1729         }
1730       }
1731     }
1732   }
1733 
1734   if (p != buffer) {
1735     ctx.update(buffer, p - buffer);
1736   }
1737   int buffer_len = static_cast<int>(p - buffer);
1738   port           = url_canonicalize_port(url->m_url_type, url->m_port);
1739 
1740   ctx.update(&port, sizeof(port));
1741   if (generation != -1) {
1742     ctx.update(&generation, sizeof(generation));
1743     Debug("url_cachekey", "Final url string for cache hash key %.*s%d%d", buffer_len, buffer, port, static_cast<int>(generation));
1744   } else {
1745     Debug("url_cachekey", "Final url string for cache hash key %.*s%d", buffer_len, buffer, port);
1746   }
1747   ctx.finalize(hash);
1748 }
1749 
1750 void
url_CryptoHash_get(const URLImpl * url,CryptoHash * hash,cache_generation_t generation)1751 url_CryptoHash_get(const URLImpl *url, CryptoHash *hash, cache_generation_t generation)
1752 {
1753   URLHashContext ctx;
1754   if ((url_hash_method != 0) && (url->m_url_type == URL_TYPE_HTTP) &&
1755       ((url->m_len_user + url->m_len_password + url->m_len_params + url->m_len_query) == 0) &&
1756       (3 + 1 + 1 + 1 + 1 + 1 + 2 + url->m_len_scheme + url->m_len_host + url->m_len_path < BUFSIZE) &&
1757       (memchr(url->m_ptr_host, '%', url->m_len_host) == nullptr) && (memchr(url->m_ptr_path, '%', url->m_len_path) == nullptr)) {
1758     url_CryptoHash_get_fast(url, ctx, hash, generation);
1759 #ifdef DEBUG
1760     CryptoHash hash_general;
1761     url_CryptoHash_get_general(url, ctx, hash_general, generation);
1762     ink_assert(*hash == hash_general);
1763 #endif
1764   } else {
1765     url_CryptoHash_get_general(url, ctx, *hash, generation);
1766   }
1767 }
1768 
1769 #undef BUFSIZE
1770 
1771 /*-------------------------------------------------------------------------
1772   -------------------------------------------------------------------------*/
1773 
1774 void
url_host_CryptoHash_get(URLImpl * url,CryptoHash * hash)1775 url_host_CryptoHash_get(URLImpl *url, CryptoHash *hash)
1776 {
1777   CryptoContext ctx;
1778 
1779   if (url->m_ptr_scheme) {
1780     ctx.update(url->m_ptr_scheme, url->m_len_scheme);
1781   }
1782 
1783   ctx.update("://", 3);
1784 
1785   if (url->m_ptr_host) {
1786     ctx.update(url->m_ptr_host, url->m_len_host);
1787   }
1788 
1789   ctx.update(":", 1);
1790 
1791   // [amc] Why is this <int> and not <in_port_t>?
1792   // Especially since it's in_port_t for url_CryptoHash_get.
1793   int port = url_canonicalize_port(url->m_url_type, url->m_port);
1794   ctx.update(&port, sizeof(port));
1795   ctx.finalize(*hash);
1796 }
1797 
1798 /*-------------------------------------------------------------------------
1799  * Regression tests
1800   -------------------------------------------------------------------------*/
1801 #if TS_HAS_TESTS
1802 #include "tscore/TestBox.h"
1803 
1804 const static struct {
1805   const char *const text;
1806   bool valid;
1807 } http_validate_hdr_field_test_case[] = {{"yahoo", true},
1808                                          {"yahoo.com", true},
1809                                          {"yahoo.wow.com", true},
1810                                          {"yahoo.wow.much.amaze.com", true},
1811                                          {"209.131.52.50", true},
1812                                          {"192.168.0.1", true},
1813                                          {"localhost", true},
1814                                          {"3ffe:1900:4545:3:200:f8ff:fe21:67cf", true},
1815                                          {"fe80:0:0:0:200:f8ff:fe21:67cf", true},
1816                                          {"fe80::200:f8ff:fe21:67cf", true},
1817                                          {"<svg onload=alert(1)>", false}, // Sample host header XSS attack
1818                                          {"jlads;f8-9349*(D&F*D(234jD*(FSD*(VKLJ#(*$@()#$)))))", false},
1819                                          {"\"\t\n", false},
1820                                          {"!@#$%^ &*(*&^%$#@#$%^&*(*&^%$#))", false},
1821                                          {":):(:O!!!!!!", false}};
1822 
REGRESSION_TEST(VALIDATE_HDR_FIELD)1823 REGRESSION_TEST(VALIDATE_HDR_FIELD)(RegressionTest *t, int /* level ATS_UNUSED */, int *pstatus)
1824 {
1825   TestBox box(t, pstatus);
1826   box = REGRESSION_TEST_PASSED;
1827 
1828   for (auto i : http_validate_hdr_field_test_case) {
1829     const char *const txt = i.text;
1830     box.check(validate_host_name({txt}) == i.valid, "Validation of FQDN (host) header: \"%s\", expected %s, but not", txt,
1831               (i.valid ? "true" : "false"));
1832   }
1833 }
1834 
// Regression test: url_is_strictly_compliant() must return the expected
// verdict for each sample URI in the table below.
REGRESSION_TEST(ParseRules_strict_URI)(RegressionTest *t, int /* level ATS_UNUSED */, int *pstatus)
{
  TestBox box(t, pstatus);
  box = REGRESSION_TEST_PASSED;

  // Sample URIs and whether strict parsing is expected to accept them.
  const struct {
    const char *const uri;
    bool valid;
  } http_strict_uri_parsing_test_case[] = {
    {"/home", true},
    {"/path/data?key=value#id", true},
    {"/ABCDEFGHIJKLMNOPQRSTUVWXYZ", true},
    {"/abcdefghijklmnopqrstuvwxyz", true},
    {"/0123456789", true},
    {":/?#[]@", true},
    {"!$&'()*+,;=", true},
    {"-._~", true},
    {"%", true},
    {"\n", false},
    {"\"", false},
    {"<", false},
    {">", false},
    {"\\", false},
    {"^", false},
    {"`", false},
    {"{", false},
    {"|", false},
    {"}", false},
    {"é", false},
  };

  for (const auto &test_case : http_strict_uri_parsing_test_case) {
    const char *const first = test_case.uri;
    const char *const last  = first + strlen(first);
    box.check(url_is_strictly_compliant(first, last) == test_case.valid, "Strictly parse URI: \"%s\", expected %s, but not", first,
              (test_case.valid ? "true" : "false"));
  }
}
1870 
1871 #endif // TS_HAS_TESTS
1872