bes  Updated for version 3.20.8
CurlUtils.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 // This file is part of the BES http package, part of the Hyrax data server.
3 //
4 // Copyright (c) 2020 OPeNDAP, Inc.
5 // Author: Nathan Potter <ndp@opendap.org>
6 //
7 // This library is free software; you can redistribute it and/or
8 // modify it under the terms of the GNU Lesser General Public
9 // License as published by the Free Software Foundation; either
10 // version 2.1 of the License, or (at your option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 // Lesser General Public License for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public
18 // License along with this library; if not, write to the Free Software
19 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 //
21 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
22 // Authors:
23 // ndp Nathan Potter <ndp@opendap.org>
24 #include "config.h"
25 
26 #include <curl/curl.h>
27 #include <cstdio>
28 #include <sstream>
29 #include <map>
30 #include <vector>
31 #include <unistd.h>
32 #include <algorithm> // std::for_each
33 #include <time.h>
34 #include <BESContextManager.h>
35 
36 #include "rapidjson/document.h"
37 
38 
39 #include "BESSyntaxUserError.h"
40 #include "BESForbiddenError.h"
41 #include "BESNotFoundError.h"
42 #include "BESTimeoutError.h"
43 #include "BESInternalError.h"
44 #include "BESDebug.h"
45 #include "BESRegex.h"
46 #include "TheBESKeys.h"
47 #include "BESUtil.h"
48 #include "BESLog.h"
49 #include "BESStopWatch.h"
50 
51 #include "BESSyntaxUserError.h"
52 #include "HttpNames.h"
53 #include "HttpUtils.h"
54 #include "ProxyConfig.h"
55 #include "AllowedHosts.h"
56 #include "CurlUtils.h"
57 #include "EffectiveUrlCache.h"
58 
59 #include "url_impl.h"
60 
61 #define MODULE "curl"
62 
63 using std::endl;
64 using std::string;
65 using std::map;
66 using std::vector;
67 using std::stringstream;
68 using std::ostringstream;
69 using namespace http;
70 
71 #define prolog std::string("CurlUtils::").append(__func__).append("() - ")
72 
73 namespace curl {
74 
75 static const unsigned int retry_limit = 10; // Amazon's suggestion
76 static const useconds_t uone_second = 1000 * 1000; // one second in microseconds (1000 ms * 1000 us per ms)
77 
78 // Forward declaration
79 curl_slist *add_auth_headers(struct curl_slist *request_headers);
80 
81 // Set this to 1 to turn on libcurl's verbose mode (for debugging).
82 int curl_trace = 0;
83 
// Inclusive range of 4xx status codes that have an entry in http_client_errors.
#define CLIENT_ERR_MIN 400
#define CLIENT_ERR_MAX 417
// The message for status code N lives at index (N - CLIENT_ERR_MIN).
const char *http_client_errors[CLIENT_ERR_MAX - CLIENT_ERR_MIN + 1] = {
    "Bad Request:",                                                        // 400
    "Unauthorized: Contact the server administrator.",                     // 401
    "Payment Required.",                                                   // 402
    "Forbidden: Contact the server administrator.",                        // 403
    "Not Found: The underlying data source or server could not be found.", // 404
    "Method Not Allowed.",                                                 // 405
    "Not Acceptable.",                                                     // 406
    "Proxy Authentication Required.",                                      // 407
    "Request Time-out.",                                                   // 408
    "Conflict.",                                                           // 409
    "Gone.",                                                               // 410
    "Length Required.",                                                    // 411
    "Precondition Failed.",                                                // 412
    "Request Entity Too Large.",                                           // 413
    "Request URI Too Large.",                                              // 414
    "Unsupported Media Type.",                                             // 415
    "Requested Range Not Satisfiable.",                                    // 416
    "Expectation Failed."                                                  // 417
};

// Inclusive range of 5xx status codes that have an entry in http_server_errors.
#define SERVER_ERR_MIN 500
#define SERVER_ERR_MAX 505
// The message for status code N lives at index (N - SERVER_ERR_MIN).
const char *http_server_errors[SERVER_ERR_MAX - SERVER_ERR_MIN + 1] = {
    "Internal Server Error.",      // 500
    "Not Implemented.",            // 501
    "Bad Gateway.",                // 502
    "Service Unavailable.",        // 503
    "Gateway Time-out.",           // 504
    "HTTP Version Not Supported."  // 505
};

/**
 * @brief Translate an HTTP status code into a human readable message.
 *
 * @param status The HTTP status code.
 * @return The table entry for codes in [CLIENT_ERR_MIN, CLIENT_ERR_MAX] or
 * [SERVER_ERR_MIN, SERVER_ERR_MAX]; for every other value the text
 * "Unknown HTTP Error: " followed by the numeric code.
 */
std::string http_status_to_string(int status) {
    const bool is_client_error = (CLIENT_ERR_MIN <= status) && (status <= CLIENT_ERR_MAX);
    if (is_client_error) {
        return std::string(http_client_errors[status - CLIENT_ERR_MIN]);
    }

    const bool is_server_error = (SERVER_ERR_MIN <= status) && (status <= SERVER_ERR_MAX);
    if (is_server_error) {
        return std::string(http_server_errors[status - SERVER_ERR_MIN]);
    }

    // Not a code we have a canned message for.
    std::stringstream unknown;
    unknown << "Unknown HTTP Error: " << status;
    return unknown.str();
}
138 
144 static string getCurlAuthTypeName(const int auth_type) {
145 
146  string authTypeString;
147  int match;
148 
149  match = auth_type & CURLAUTH_BASIC;
150  if (match) {
151  authTypeString += "CURLAUTH_BASIC";
152  }
153 
154  match = auth_type & CURLAUTH_DIGEST;
155  if (match) {
156  if (!authTypeString.empty())
157  authTypeString += " ";
158  authTypeString += "CURLAUTH_DIGEST";
159  }
160 
161  match = auth_type & CURLAUTH_DIGEST_IE;
162  if (match) {
163  if (!authTypeString.empty())
164  authTypeString += " ";
165  authTypeString += "CURLAUTH_DIGEST_IE";
166  }
167 
168  match = auth_type & CURLAUTH_GSSNEGOTIATE;
169  if (match) {
170  if (!authTypeString.empty())
171  authTypeString += " ";
172  authTypeString += "CURLAUTH_GSSNEGOTIATE";
173  }
174 
175  match = auth_type & CURLAUTH_NTLM;
176  if (match) {
177  if (!authTypeString.empty())
178  authTypeString += " ";
179  authTypeString += "CURLAUTH_NTLM";
180  }
181 
182 #if 0
183  match = auth_type & CURLAUTH_ANY;
184  if(match){
185  if(!authTypeString.empty())
186  authTypeString += " ";
187  authTypeString += "CURLAUTH_ANY";
188  }
189 
190 
191  match = auth_type & CURLAUTH_ANY;
192  if(match){
193  if(!authTypeString.empty())
194  authTypeString += " ";
195  authTypeString += "CURLAUTH_ANYSAFE";
196  }
197 
198 
199  match = auth_type & CURLAUTH_ANY;
200  if(match){
201  if(!authTypeString.empty())
202  authTypeString += " ";
203  authTypeString += "CURLAUTH_ONLY";
204  }
205 #endif
206 
207  return authTypeString;
208 }
209 
/**
 * @brief libcurl write callback that discards the response body.
 *
 * libcurl treats any return value other than the number of bytes it handed to
 * the callback (size * nmemb) as a write error, so the full byte count is
 * reported as consumed.
 *
 * @param data Unused.
 * @param size Size of one data item.
 * @param nmemb Number of data items.
 * @param userdata Unused.
 * @return size * nmemb, i.e. "all bytes handled".
 */
static size_t writeNothing(char * /* data */, size_t size, size_t nmemb, void * /* userdata */) {
    return size * nmemb;
}
216 
221  static size_t writeToOpenFileDescriptor(char *data, size_t /* size */, size_t nmemb, void *userdata) {
222 
223  int *fd = (int *) userdata;
224 
225  BESDEBUG(MODULE, prolog << "Bytes received " << nmemb << endl);
226  int wrote = write(*fd, data, nmemb);
227  BESDEBUG(MODULE, prolog << "Bytes written " << wrote << endl);
228 
229  return wrote;
230  }
231 
232 
255 static size_t save_http_response_headers(void *ptr, size_t size, size_t nmemb, void *resp_hdrs) {
256  BESDEBUG(MODULE, prolog << "Inside the header parser." << endl);
257  vector<string> *hdrs = static_cast<vector<string> * >(resp_hdrs);
258 
259  // Grab the header, minus the trailing newline. Or \r\n pair.
260  string complete_line;
261  if (nmemb > 1 && *(static_cast<char *>(ptr) + size * (nmemb - 2)) == '\r')
262  complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 2));
263  else
264  complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 1));
265 
266  // Store all non-empty headers that are not HTTP status codes
267  if (complete_line != "" && complete_line.find("HTTP") == string::npos) {
268  BESDEBUG(MODULE, prolog << "Header line: " << complete_line << endl);
269  hdrs->push_back(complete_line);
270  }
271 
272  return size * nmemb;
273 }
274 
275 
283 static int curl_debug(CURL *, curl_infotype info, char *msg, size_t size, void *) {
284  string message(msg, size);
285 
286  switch (info) {
287  case CURLINFO_TEXT:
288  BESDEBUG(MODULE, prolog << "Text: " << message << endl);
289  break;
290  case CURLINFO_HEADER_IN:
291  BESDEBUG(MODULE, prolog << "Header in: " << message << endl);
292  break;
293  case CURLINFO_HEADER_OUT:
294  BESDEBUG(MODULE, prolog << "Header out: " << endl << message << endl);
295  break;
296  case CURLINFO_DATA_IN:
297  BESDEBUG(MODULE, prolog << "Data in: " << message << endl);
298  break;
299  case CURLINFO_DATA_OUT:
300  BESDEBUG(MODULE, prolog << "Data out: " << message << endl);
301  break;
302  case CURLINFO_END:
303  BESDEBUG(MODULE, prolog << "End: " << message << endl);
304  break;
305 #ifdef CURLINFO_SSL_DATA_IN
306  case CURLINFO_SSL_DATA_IN:
307  BESDEBUG(MODULE, prolog << "SSL Data in: " << message << endl ); break;
308 #endif
309 #ifdef CURLINFO_SSL_DATA_OUT
310  case CURLINFO_SSL_DATA_OUT:
311  BESDEBUG(MODULE, prolog << "SSL Data out: " << message << endl ); break;
312 #endif
313  default:
314  BESDEBUG(MODULE, prolog << "Curl info: " << message << endl);
315  break;
316  }
317  return 0;
318 }
319 
320 
326 class BuildHeaders : public std::unary_function<const string &, void> {
327  struct curl_slist *d_cl;
328 
329 public:
330  BuildHeaders() : d_cl(0) {}
331 
332  void operator()(const string &header) {
333  BESDEBUG(MODULE, prolog << "Adding '" << header.c_str() << "' to the header list." << endl);
334  d_cl = curl_slist_append(d_cl, header.c_str());
335  }
336 
337  struct curl_slist *get_headers() {
338  return d_cl;
339  }
340 };
341 
362  bool configure_curl_handle_for_proxy(CURL *ceh, const string &target_url) {
363  BESDEBUG(MODULE, prolog << "BEGIN." << endl);
364 
365  bool using_proxy = http::ProxyConfig::theOne()->is_configured();
366  if (using_proxy) {
367 
368  BESDEBUG(MODULE, prolog << "Proxy has been configured..." << endl);
369 
370  http::ProxyConfig *proxy = http::ProxyConfig::theOne();
371 
372  // TODO remove these local variables (if possible) and pass the values into curl_easy_setopt() directly from HttpUtils
373  string proxyHost = proxy->host();
374  int proxyPort = proxy->port();
375  string proxyPassword = proxy->proxy_password();
376  string proxyUser = proxy->user();
377  string proxyUserPW = proxy->password();
378  int proxyAuthType = proxy->auth_type();
379  string no_proxy_regex = proxy->no_proxy_regex();
380 
381 
382  // Don't set up the proxy server for URLs that match the 'NoProxy'
383  // regex set in the gateway.conf file.
384 
385  // Don't create the regex if the string is empty
386  if (!no_proxy_regex.empty()) {
387  BESDEBUG(MODULE, prolog << "Found NoProxyRegex." << endl);
388  BESRegex r(no_proxy_regex.c_str());
389  if (r.match(target_url.c_str(), target_url.length()) != -1) {
390  BESDEBUG(MODULE,
391  prolog << "Found NoProxy match. Regex: " << no_proxy_regex << "; Url: " << target_url
392  << endl);
393  using_proxy = false;
394  }
395  }
396 
397  if (using_proxy) {
398  CURLcode res;
399  char error_buffer[CURL_ERROR_SIZE];
400 
401  BESDEBUG(MODULE, prolog << "Setting up a proxy server." << endl);
402  BESDEBUG(MODULE, prolog << "Proxy host: " << proxyHost << endl);
403  BESDEBUG(MODULE, prolog << "Proxy port: " << proxyPort << endl);
404 
405  set_error_buffer(ceh, error_buffer);
406 
407  res = curl_easy_setopt(ceh, CURLOPT_PROXY, proxyHost.data());
408  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXY", error_buffer, __FILE__, __LINE__);
409 
410  res = curl_easy_setopt(ceh, CURLOPT_PROXYPORT, proxyPort);
411  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPORT", error_buffer, __FILE__, __LINE__);
412 
413  // oddly "#ifdef CURLOPT_PROXYAUTH" doesn't work - even though CURLOPT_PROXYAUTH is defined and valued at 111 it
414  // fails the test. Eclipse hover over the CURLOPT_PROXYAUTH symbol shows: "CINIT(PROXYAUTH, LONG, 111)",
415  // for what that's worth
416 
417  // According to http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTPROXYAUTH
418  // As of 4/21/08 only NTLM, Digest and Basic work.
419 
420  res = curl_easy_setopt(ceh, CURLOPT_PROXYAUTH, proxyAuthType);
421  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYAUTH", error_buffer, __FILE__, __LINE__);
422  BESDEBUG(MODULE, prolog << "Using CURLOPT_PROXYAUTH = " << getCurlAuthTypeName(proxyAuthType) << endl);
423 
424  if (!proxyUser.empty()) {
425  res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERNAME, proxyUser.data());
426  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERNAME", error_buffer, __FILE__,
427  __LINE__);
428  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERNAME : " << proxyUser << endl);
429 
430  if (!proxyPassword.empty()) {
431  res = curl_easy_setopt(ceh, CURLOPT_PROXYPASSWORD, proxyPassword.data());
432  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPASSWORD", error_buffer, __FILE__,
433  __LINE__);
434  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYPASSWORD: " << proxyPassword << endl);
435  }
436  } else if (!proxyUserPW.empty()) {
437  res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERPWD, proxyUserPW.data());
438  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERPWD", error_buffer, __FILE__, __LINE__);
439  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERPWD : " << proxyUserPW << endl);
440  }
441  unset_error_buffer(ceh);
442  }
443  }
444  BESDEBUG(MODULE, prolog << "END. using_proxy: " << (using_proxy ? "true" : "false") << endl);
445  return using_proxy;
446  }
447 
// Disabled (#if 0) earlier version of configure_curl_handle_for_proxy(); it is never compiled.
// It differs from the active version in how it decides whether a proxy is in use: it reads the
// settings via http::ProxyConfig::TheConfig(), treats a non-empty proxy host as "proxy configured",
// and substitutes port 8080 when the configured port is 0. The cURL option setup that follows is
// the same sequence as in the active function.
448 #if 0
449  bool configure_curl_handle_for_proxy(CURL *ceh, const string &target_url) {
450  BESDEBUG(MODULE, prolog << "BEGIN." << endl);
451 
452  bool using_proxy = false;
453 
454  http::ProxyConfig *proxy = http::ProxyConfig::TheConfig();
455 
456  // TODO remove these local variables (if possible) and pass the values into curl_easy_setopt() directly from HttpUtils
457  string proxyHost = proxy->host();
458  int proxyPort = proxy->port();
459  string proxyPassword = proxy->proxy_password();
460  string proxyUser = proxy->user();
461  string proxyUserPW = proxy->password();
462  int proxyAuthType = proxy->auth_type();
463  string no_proxy_regex = proxy->no_proxy_regex();
464 
465  if (!proxyHost.empty()) {
466  using_proxy = true;
467  if (proxyPort == 0)
468  proxyPort = 8080;
469 
470  // Apparently we don't need this...
471  //if(proxyProtocol.empty())
472  // proxyProtocol = "http";
473 
474  }
475  if (using_proxy) {
476  BESDEBUG(MODULE, prolog << "Found proxy configuration." << endl);
477 
478  // Don't set up the proxy server for URLs that match the 'NoProxy'
479  // regex set in the gateway.conf file.
480 
481  // Don't create the regex if the string is empty
482  if (!no_proxy_regex.empty()) {
483  BESDEBUG(MODULE, prolog << "Found NoProxyRegex." << endl);
484  BESRegex r(no_proxy_regex.c_str());
485  if (r.match(target_url.c_str(), target_url.length()) != -1) {
486  BESDEBUG(MODULE,
487  prolog << "Found NoProxy match. Regex: " << no_proxy_regex << "; Url: " << target_url
488  << endl);
489  using_proxy = false;
490  }
491  }
492 
493  if (using_proxy) {
494  CURLcode res;
495  char error_buffer[CURL_ERROR_SIZE];
496 
497  BESDEBUG(MODULE, prolog << "Setting up a proxy server." << endl);
498  BESDEBUG(MODULE, prolog << "Proxy host: " << proxyHost << endl);
499  BESDEBUG(MODULE, prolog << "Proxy port: " << proxyPort << endl);
500 
501  set_error_buffer(ceh, error_buffer);
502 
503  res = curl_easy_setopt(ceh, CURLOPT_PROXY, proxyHost.data());
504  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXY", error_buffer, __FILE__, __LINE__);
505 
506  res = curl_easy_setopt(ceh, CURLOPT_PROXYPORT, proxyPort);
507  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPORT", error_buffer, __FILE__, __LINE__);
508 
509  // oddly "#ifdef CURLOPT_PROXYAUTH" doesn't work - even though CURLOPT_PROXYAUTH is defined and valued at 111 it
510  // fails the test. Eclipse hover over the CURLOPT_PROXYAUTH symbol shows: "CINIT(PROXYAUTH, LONG, 111)",
511  // for what that's worth
512 
513  // According to http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTPROXYAUTH
514  // As of 4/21/08 only NTLM, Digest and Basic work.
515 
516  res = curl_easy_setopt(ceh, CURLOPT_PROXYAUTH, proxyAuthType);
517  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYAUTH", error_buffer, __FILE__, __LINE__);
518  BESDEBUG(MODULE, prolog << "Using CURLOPT_PROXYAUTH = " << getCurlAuthTypeName(proxyAuthType) << endl);
519 
520  if (!proxyUser.empty()) {
521  res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERNAME, proxyUser.data());
522  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERNAME", error_buffer, __FILE__, __LINE__);
523  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERNAME : " << proxyUser << endl);
524 
525  if (!proxyPassword.empty()) {
526  res = curl_easy_setopt(ceh, CURLOPT_PROXYPASSWORD, proxyPassword.data());
527  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYPASSWORD", error_buffer, __FILE__,
528  __LINE__);
529  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYPASSWORD: " << proxyPassword << endl);
530  }
531  }
532  else if (!proxyUserPW.empty()) {
533  res = curl_easy_setopt(ceh, CURLOPT_PROXYUSERPWD, proxyUserPW.data());
534  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_PROXYUSERPWD", error_buffer, __FILE__, __LINE__);
535  BESDEBUG(MODULE, prolog << "CURLOPT_PROXYUSERPWD : " << proxyUserPW << endl);
536  }
537  unset_error_buffer(ceh);
538  }
539  }
540  BESDEBUG(MODULE, prolog << "END." << endl);
541 
542  return using_proxy;
543  }
544 #endif
545 
546 
547 
548 CURL *init(const string &target_url,
549  const struct curl_slist *http_request_headers,
550  vector<string> *http_response_hdrs) {
551  CURL *swanky_new_curl_easy_handle = curl_easy_init();
552  return init(swanky_new_curl_easy_handle, target_url, http_request_headers, http_response_hdrs);
553 }
554 
569 CURL *init(CURL *ceh,
570  const string &target_url,
571  const struct curl_slist *http_request_headers,
572  vector<string> *http_response_hdrs
573 ) {
574  char error_buffer[CURL_ERROR_SIZE];
575  error_buffer[0] = 0; // Null terminate this string for safety.
576  CURLcode res;
577 
578  if (!ceh)
579  throw BESInternalError("Could not initialize cURL easy handle.", __FILE__, __LINE__);
580 
581  // SET Error Buffer (for use during this setup) ----------------------------------------------------------------
582  set_error_buffer(ceh, error_buffer);
583 
584  // Target URL --------------------------------------------------------------------------------------------------
585  res = curl_easy_setopt(ceh, CURLOPT_URL, target_url.c_str());
586  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_URL", error_buffer, __FILE__, __LINE__);
587 
588  // Load in the default headers to send with a request. The empty Pragma
589  // headers overrides libcurl's default Pragma: no-cache header (which
590  // will disable caching by Squid, etc.).
591  // the empty Pragma never appears in the outgoing headers when this isn't present
592  // d_request_headers->push_back(string("Pragma: no-cache"));
593  // d_request_headers->push_back(string("Cache-Control: no-cache"));
594 
595  //TODO Do we need this test? what if the pointer is null? Probably it's fine...
596  if (http_request_headers) {
597  // Add the http_request_headers to the cURL handle.
598  res = curl_easy_setopt(ceh, CURLOPT_HTTPHEADER, http_request_headers);
599  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HTTPHEADER", error_buffer, __FILE__, __LINE__);
600  }
601 
602 
603  if (http_response_hdrs) {
604  res = curl_easy_setopt(ceh, CURLOPT_HEADERFUNCTION, save_http_response_headers);
605  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HEADERFUNCTION", error_buffer, __FILE__, __LINE__);
606 
607  // Pass save_http_response_headers() a pointer to the vector<string> where the
608  // response headers may be stored. Callers can use the resp_hdrs
609  // value/result parameter to get the raw response header information .
610  res = curl_easy_setopt(ceh, CURLOPT_WRITEHEADER, http_response_hdrs);
611  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEHEADER", error_buffer, __FILE__, __LINE__);
612  }
613 
614  // Allow compressed responses. Sending an empty string enables all supported compression types.
615 #ifndef CURLOPT_ACCEPT_ENCODING
616  res = curl_easy_setopt(ceh, CURLOPT_ENCODING, "");
617  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_ENCODING", error_buffer, __FILE__, __LINE__);
618 #else
619  res = curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
620  check_setopt_result(res, prolog, "CURLOPT_ACCEPT_ENCODING", error_buffer, __FILE__,__LINE__);
621 #endif
622  // Disable Progress Meter
623  res = curl_easy_setopt(ceh, CURLOPT_NOPROGRESS, 1L);
624  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NOPROGRESS", error_buffer, __FILE__, __LINE__);
625 
626  // Disable cURL signal handling
627  res = curl_easy_setopt(ceh, CURLOPT_NOSIGNAL, 1L);
628  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NOSIGNAL", error_buffer, __FILE__, __LINE__);
629 
630 
631  // - - - - - - - - - - - - - - - - - - - - - - - - - - - -
632  // Authentication config.
633  //
634 
635  // We have to set FailOnError to false for any of the non-Basic
636  // authentication schemes to work. 07/28/03 jhrg
637  res = curl_easy_setopt(ceh, CURLOPT_FAILONERROR, 0L);
638  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FAILONERROR", error_buffer, __FILE__, __LINE__);
639 
640 
641  // CURLAUTH_ANY means libcurl will use Basic, Digest, GSS Negotiate, or NTLM,
642  // choosing the the 'safest' one supported by the server.
643  // This requires curl 7.10.6 which is still in pre-release. 07/25/03 jhrg
644  res = curl_easy_setopt(ceh, CURLOPT_HTTPAUTH, (long) CURLAUTH_ANY);
645  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_HTTPAUTH", error_buffer, __FILE__, __LINE__);
646 
647 
648  // CURLOPT_NETRC means to use the netrc file for credentials.
649  // CURL_NETRC_OPTIONAL Means that if the supplied URL contains a username
650  // and password to prefer that to using the content of the netrc file.
651  res = curl_easy_setopt(ceh, CURLOPT_NETRC, CURL_NETRC_OPTIONAL);
652  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NETRC", error_buffer, __FILE__, __LINE__);
653 
654  // If the configuration specifies a particular .netrc credentials file, use it.
655  string netrc_file = get_netrc_filename();
656  if (!netrc_file.empty()) {
657  res = curl_easy_setopt(ceh, CURLOPT_NETRC_FILE, netrc_file.c_str());
658  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_NETRC_FILE", error_buffer, __FILE__, __LINE__);
659 
660  }
661  VERBOSE(prolog << " is using the netrc file '"
662  << ((!netrc_file.empty()) ? netrc_file : "~/.netrc") << "'" << endl);
663 
664 
665  // - - - - - - - - - - - - - - - - - - - - - - - - - - - -
666  // Cookies
667  //
668  res = curl_easy_setopt(ceh, CURLOPT_COOKIEFILE, curl::get_cookie_filename().c_str());
669  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_COOKIEFILE", error_buffer, __FILE__, __LINE__);
670 
671  res = curl_easy_setopt(ceh, CURLOPT_COOKIEJAR, curl::get_cookie_filename().c_str());
672  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_COOKIEJAR", error_buffer, __FILE__, __LINE__);
673 
674  // save_http_response_headers
675 
676  // Follow 302 (redirect) responses
677  res = curl_easy_setopt(ceh, CURLOPT_FOLLOWLOCATION, 1L);
678  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FOLLOWLOCATION", error_buffer, __FILE__, __LINE__);
679 
680  res = curl_easy_setopt(ceh, CURLOPT_MAXREDIRS, max_redirects());
681  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_MAXREDIRS", error_buffer, __FILE__, __LINE__);
682 
683  // Set the user agent to Hyrax's user agent value
684  res = curl_easy_setopt(ceh, CURLOPT_USERAGENT, hyrax_user_agent().c_str());
685  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_USERAGENT", error_buffer, __FILE__, __LINE__);
686 
687 #if 0
688  // If the user turns off SSL validation...
689 if (!d_rcr->get_validate_ssl() == 0) {
690  res = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
691  check_setopt_result(res, prolog, "CURLOPT_SSL_VERIFYPEER", error_buffer, __FILE__, __LINE__);
692  res = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
693  check_setopt_result(res, prolog, "CURLOPT_SSL_VERIFYHOST", error_buffer, __FILE__, __LINE__);
694 }
695 #endif
696 
697  if (curl_trace) {
698  BESDEBUG(MODULE, prolog << "Curl version: " << curl_version() << endl);
699  res = curl_easy_setopt(ceh, CURLOPT_VERBOSE, 1L);
700  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_VERBOSE", error_buffer, __FILE__, __LINE__);
701  BESDEBUG(MODULE, prolog << "Curl in verbose mode." << endl);
702 
703  res = curl_easy_setopt(ceh, CURLOPT_DEBUGFUNCTION, curl_debug);
704  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_DEBUGFUNCTION", error_buffer, __FILE__, __LINE__);
705  BESDEBUG(MODULE, prolog << "Curl debugging function installed." << endl);
706  }
707 
708  // We unset the error buffer here because we know that curl::configure_curl_handle_for_proxy() will use it's own.
709  unset_error_buffer(ceh);
710  // Configure the a proxy for this url (if appropriate).
711  curl::configure_curl_handle_for_proxy(ceh, target_url);
712 
713  BESDEBUG(MODULE, prolog << "curl: " << (void *) ceh << endl);
714  return ceh;
715 }
716 
717 string get_range_arg_string(const unsigned long long &offset, const unsigned long long &size) {
718  ostringstream range; // range-get needs a string arg for the range
719  range << offset << "-" << offset + size - 1;
720  BESDEBUG(MODULE, prolog << " range: " << range.str() << endl);
721  return range.str();
722 }
723 
739 CURL *init_effective_url_retriever_handle(const string &target_url, struct curl_slist *req_headers,
740  vector<string> &resp_hdrs) {
741  char error_buffer[CURL_ERROR_SIZE];
742  CURLcode res;
743  CURL *ceh = 0;
744 
745  error_buffer[0] = 0; // null terminate empty string
746 
747  ceh = curl::init(target_url, req_headers, &resp_hdrs);
748 
749  set_error_buffer(ceh, error_buffer);
750 
751  // get the offset to offset + size bytes
752  res = curl_easy_setopt(ceh, CURLOPT_RANGE, get_range_arg_string(0, 4).c_str());
753  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_RANGE", error_buffer, __FILE__, __LINE__);
754 
755  res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, writeNothing);
756  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", error_buffer, __FILE__, __LINE__);
757 
758  // Pass save_raw_http_headers() a pointer to the vector<string> where the
759  // response headers may be stored. Callers can use the resp_hdrs
760  // value/result parameter to get the raw response header information .
761  res = curl_easy_setopt(ceh, CURLOPT_WRITEHEADER, &resp_hdrs);
762  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEHEADER", error_buffer, __FILE__, __LINE__);
763 
764  unset_error_buffer(ceh);
765 
766  return ceh;
767 }
768 
786 void http_get_and_write_resource(const string &target_url,
787  const int fd,
788  vector<string> *http_response_headers) {
789 
790  char error_buffer[CURL_ERROR_SIZE];
791  CURLcode res;
792  CURL *ceh = NULL;
793  curl_slist *req_headers = NULL;
794  BuildHeaders header_builder;
795 
796  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
797  // Before we do anything, make sure that the URL is OK to pursue.
798  if (!bes::AllowedHosts::theHosts()->is_allowed(target_url)) {
799  string err = (string) "The specified URL " + target_url
800  + " does not match any of the accessible services in"
801  + " the allowed hosts list.";
802  BESDEBUG(MODULE, prolog << err << endl);
803  throw BESSyntaxUserError(err, __FILE__, __LINE__);
804  }
805 
806  // Add the authorization headers
807  req_headers = add_auth_headers(req_headers);
808 
809  try {
810  // OK! Make the cURL handle
811  ceh = init(target_url, req_headers, http_response_headers);
812 
813  set_error_buffer(ceh, error_buffer);
814 
815  res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, writeToOpenFileDescriptor);
816  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", error_buffer, __FILE__, __LINE__);
817 
818 #ifdef CURLOPT_WRITEDATA
819  res = curl_easy_setopt(ceh, CURLOPT_WRITEDATA, &fd);
820  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEDATA", error_buffer, __FILE__, __LINE__);
821 #else
822  res = curl_easy_setopt(ceh, CURLOPT_FILE, &fd);
823  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_FILE", error_buffer, __FILE__, __LINE__);
824 #endif
825  unset_error_buffer(ceh);
826 
827  super_easy_perform(ceh);
828 
829  // Free the header list
830  if (req_headers)
831  curl_slist_free_all(req_headers);
832  if (ceh)
833  curl_easy_cleanup(ceh);
834  BESDEBUG(MODULE, prolog << "Called curl_easy_cleanup()." << endl);
835  }
836  catch (...) {
837  if (req_headers)
838  curl_slist_free_all(req_headers);
839  if (ceh)
840  curl_easy_cleanup(ceh);
841  throw;
842  }
843  BESDEBUG(MODULE, prolog << "END" << endl);
844 }
845 
853 string error_message(const CURLcode response_code, char *error_buffer) {
854  std::ostringstream oss;
855  size_t len = strlen(error_buffer);
856  if (len) {
857  oss << "cURL_error_buffer: '" << error_buffer;
858  }
859  oss << "' cURL_message: '" << curl_easy_strerror(response_code);
860  oss << "' (code: " << (int) response_code << ")";
861  return oss.str();
862 }
863 
864 /*
865 * @brief Callback passed to libcurl to handle reading a single byte.
866 *
867 * This callback assumes that the size of the data is small enough
868 * that all of the bytes will be either read at once or that a local
869  * temporary buffer can be used to build up the values.
870 *
871 * @param buffer Data from libcurl
872 * @param size Number of bytes
873 * @param nmemb Total size of data in this call is 'size * nmemb'
874 * @param data Pointer to this
875 * @return The number of bytes read
876 */
877 size_t c_write_data(void *buffer, size_t size, size_t nmemb, void *data) {
878  size_t nbytes = size * nmemb;
879  //cerr << "ngap_write_data() bytes: " << nbytes << " size: " << size << " nmemb: " << nmemb << " buffer: " << buffer << " data: " << data << endl;
880  memcpy(data, buffer, nbytes);
881  return nbytes;
882 }
883 
890 std::string http_get_as_string(const std::string &target_url) {
891 
892  // @TODO @FIXME Make the size of this buffer one of:
893  // a) A configuration setting.
894  // b) A new parameter to the function. (unsigned long)
895  // c) Do a HEAD on the URL, check for the Content-Length header and plan accordingly.
896  //
897  char response_buf[1024 * 1024];
898 
899  http_get(target_url, response_buf);
900  string response(response_buf);
901  return response;
902 }
903 
911 rapidjson::Document http_get_as_json(const std::string &target_url) {
912 
913  // @TODO @FIXME Make the size of this buffer one of:
914  // a) A configuration setting.
915  // b) A new parameter to the function. (unsigned long)
916  // c) Do a HEAD on the URL, check for the Content-Length header and plan accordingly.
917  //
918 
919  char response_buf[1024 * 1024];
920 
921  curl::http_get(target_url, response_buf);
923  d.Parse(response_buf);
924  return d;
925 }
926 
932 void http_get(const std::string &target_url, char *response_buf) {
933 
934  char errbuf[CURL_ERROR_SIZE];
935  CURL *ceh = NULL;
936  CURLcode res;
937 
938  curl_slist *request_headers = NULL;
939  // Add the authorization headers
940  request_headers = add_auth_headers(request_headers);
941 
942  try {
943 
944  ceh = curl::init(target_url, request_headers, NULL);
945  if (!ceh)
946  throw BESInternalError(string("ERROR! Failed to acquire cURL Easy Handle! "), __FILE__, __LINE__);
947 
948  // Error Buffer (for use during this setup) ----------------------------------------------------------------
949  set_error_buffer(ceh, errbuf);
950 
951  // Pass all data to the 'write_data' function --------------------------------------------------------------
952  res = curl_easy_setopt(ceh, CURLOPT_WRITEFUNCTION, c_write_data);
953  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", errbuf, __FILE__, __LINE__);
954 
955  // Pass this to write_data as the fourth argument ----------------------------------------------------------
956  res = curl_easy_setopt(ceh, CURLOPT_WRITEDATA, reinterpret_cast<void *>(response_buf));
957  eval_curl_easy_setopt_result(res, prolog, "CURLOPT_WRITEDATA", errbuf, __FILE__, __LINE__);
958 
959  unset_error_buffer(ceh);
960 
961  super_easy_perform(ceh);
962 
963  if (request_headers)
964  curl_slist_free_all(request_headers);
965  if (ceh)
966  curl_easy_cleanup(ceh);
967  }
968  catch (...) {
969  if (request_headers)
970  curl_slist_free_all(request_headers);
971  if (ceh)
972  curl_easy_cleanup(ceh);
973  }
974 }
975 
// DISABLED CODE: everything between this '#if 0' and the matching '#endif' is
// removed by the preprocessor and is never compiled.
//
// set_up_easy_handle() obtains an easy handle from curl::init(), installs
// c_write_data as the write callback with response_buff as its user data, and
// returns the handle to the caller. The same setup steps appear inline in
// http_get(). The nested '#if 0' section holds option settings that, per the
// note inside it, are handled by curl::init().
// NOTE(review): the code calls check_setopt_result() and uses cookies_file,
// neither of which is defined in the visible part of this file - it may not
// compile if re-enabled.
#if 0
CURL *set_up_easy_handle(const string &target_url, struct curl_slist *request_headers, char *response_buff) {
    char errbuf[CURL_ERROR_SIZE];
    CURL *d_handle;
    CURLcode res;

    d_handle = curl::init(target_url,request_headers,NULL);
    if (!d_handle)
        throw BESInternalError(string("ERROR! Failed to acquire cURL Easy Handle! "), __FILE__, __LINE__);

    // Error Buffer (for use during this setup) --------------------------------------------------------------------
    set_error_buffer(d_handle,errbuf);

    // Pass all data to the 'write_data' function ------------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_WRITEFUNCTION, c_write_data);
    check_setopt_result(res, prolog, "CURLOPT_WRITEFUNCTION", errbuf, __FILE__, __LINE__);

    // Pass this to write_data as the fourth argument --------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_WRITEDATA, reinterpret_cast<void *>(response_buff));
    check_setopt_result(res, prolog, "CURLOPT_WRITEDATA", errbuf, __FILE__, __LINE__);

#if 0
    // handled by curl::init() - SBL 9.10.20
    // Follow redirects --------------------------------------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_FOLLOWLOCATION, 1L);
    check_setopt_result(res, prolog, "CURLOPT_FOLLOWLOCATION", errbuf, __FILE__, __LINE__);

    // Use cookies -------------------------------------------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_COOKIEFILE, cookies_file.c_str());
    check_setopt_result(res, prolog, "CURLOPT_COOKIEFILE", errbuf, __FILE__, __LINE__);

    res = curl_easy_setopt(d_handle, CURLOPT_COOKIEJAR, cookies_file.c_str());
    check_setopt_result(res, prolog, "CURLOPT_COOKIEJAR", errbuf, __FILE__, __LINE__);

    // Authenticate using best available ---------------------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_HTTPAUTH, (long) CURLAUTH_ANY);
    check_setopt_result(res, prolog, "CURLOPT_HTTPAUTH", errbuf, __FILE__, __LINE__);

    // Use .netrc for credentials ----------------------------------------------------------------------------------
    res = curl_easy_setopt(d_handle, CURLOPT_NETRC, CURL_NETRC_OPTIONAL);
    check_setopt_result(res, prolog, "CURLOPT_NETRC", errbuf, __FILE__, __LINE__);

    // If the configuration specifies a particular .netrc credentials file, use it. --------------------------------
    string netrc_file = get_netrc_filename();
    if (!netrc_file.empty()) {
        res = curl_easy_setopt(d_handle, CURLOPT_NETRC_FILE, netrc_file.c_str());
        check_setopt_result(res, prolog, "CURLOPT_NETRC_FILE", errbuf, __FILE__, __LINE__);
    }

    VERBOSE(__FILE__ << "::get_easy_handle() is using the netrc file '"
                     << ((!netrc_file.empty()) ? netrc_file : "~/.netrc") << "'" << endl);
#endif

    // Detach errbuf from the handle: it is a local and dies when this function returns.
    unset_error_buffer(d_handle);

    return d_handle;
}
#endif
1041 
1061 void super_easy_perform(CURL *c_handle) {
1062  unsigned int attempts = 0;
1063  useconds_t retry_time = uone_second / 4;
1064  bool success;
1065  CURLcode curl_code;
1066  char curlErrorBuf[CURL_ERROR_SIZE];
1067  string target_url;
1068 
1069  string empty_str;
1070  target_url = get_effective_url(c_handle, empty_str);
1071  // We check the value of target_url to see if the URL was correctly set in the cURL handle.
1072  if (target_url.empty())
1073  throw BESInternalError("URL acquisition failed.", __FILE__, __LINE__);
1074 
1075  // SET Error Buffer --------------------------------------------------------------------------------------------
1076  set_error_buffer(c_handle, curlErrorBuf);
1077  do {
1078  curlErrorBuf[0] = 0; // Initialize to empty string
1079  ++attempts;
1080  BESDEBUG(MODULE, prolog << "Requesting URL: " << target_url << " attempt: " << attempts << endl);
1081 
1082  curl_code = curl_easy_perform(c_handle);
1083  success = eval_curl_easy_perform_code(c_handle, target_url, curl_code, curlErrorBuf, attempts);
1084  if (success) {
1085  // Nothing obvious went wrong with the curl_easy_perform() so now we check the HTTP stuff
1086  success = eval_http_get_response(c_handle, curlErrorBuf, target_url);
1087  }
1088  // If the curl_easy_perform failed, or if the http request failed then
1089  // we keep trying until we have exceeded the retry_limit.
1090  if (!success) {
1091  if (attempts == retry_limit) {
1092  string msg = prolog + "ERROR - Problem with data transfer. Number of re-tries exceeded. Giving up.";
1093  ERROR_LOG(msg << endl);
1094  throw BESInternalError(msg, __FILE__, __LINE__);
1095  }
1096  else {
1097  ERROR_LOG(prolog << "ERROR - Problem with data transfer. Will retry (url: " << target_url <<
1098  " attempt: " << attempts << ")." << endl);
1099  usleep(retry_time);
1100  retry_time *= 2;
1101  }
1102  }
1103  } while (!success);
1104  // Unset the buffer as it goes out of scope
1105  unset_error_buffer(c_handle);
1106 }
1107 
// DISABLED CODE: everything between this '#if 0' and the final '#endif' is
// removed by the preprocessor and is never compiled.
//
// read_data() runs the same perform / evaluate / back-off / retry loop as
// super_easy_perform(), but reads the URL from the handle directly with
// curl_easy_getinfo() and calls a two-argument eval_http_get_response().
// The nested '#if 0' section is a still older version of the retry loop.
#if 0

void read_data(CURL *c_handle) {

    unsigned int attempts = 0;
    useconds_t retry_time = uone_second / 4;
    bool success;
    CURLcode curl_code;
    char curlErrorBuf[CURL_ERROR_SIZE];
    char *urlp = NULL;

    curl_easy_getinfo(c_handle, CURLINFO_EFFECTIVE_URL, &urlp);
    // Checking the curl_easy_getinfo return value in this case is pointless. If it's CURLE_OK then we
    // still have to check the value of urlp to see if the URL was correctly set in the
    // cURL handle. If it fails then it fails, and urlp is not set. If we just focus on the value of urlp then
    // we can just check the one thing.
    if (!urlp)
        throw BESInternalError("URL acquisition failed.", __FILE__, __LINE__);

    // SET Error Buffer --------------------------------------------------------------------------------------------
    set_error_buffer(c_handle, curlErrorBuf);
    do {
        // bool do_retry;
        curlErrorBuf[0]=0; // Initialize to empty string
        ++attempts;
        BESDEBUG(MODULE, prolog << "Requesting URL: " << urlp << " attempt: " << attempts << endl);

        curl_code = curl_easy_perform(c_handle);
        success = eval_curl_easy_perform_code(c_handle, urlp, curl_code, curlErrorBuf, attempts);
        if(success){
            // Nothing obvious went wrong with the curl_easy_perform() so now we check the HTTP stuff
            success = eval_http_get_response(c_handle, urlp);
        }
        // If the curl_easy_perform failed, or if the http request failed then
        // we keep trying until we have exceeded the retry_limit.
        if (!success) {
            if (attempts == retry_limit) {
                string msg = prolog + "ERROR - Problem with data transfer. Number of re-tries exceeded. Giving up.";
                LOG(msg << endl);
                throw BESInternalError(msg, __FILE__, __LINE__);
            }
            else {
                LOG(prolog << "ERROR - Problem with data transfer. Will retry (url: " << urlp <<
                           " attempt: " << attempts << ")." << endl);
                // Pause before the next attempt; the pause doubles after each failure.
                usleep(retry_time);
                retry_time *= 2;
            }
        }
    } while (!success);

#if 0
    // Try until retry_limit or success...
    do {
        curlErrorBuf[0] = 0; // clear the error buffer with a null termination at index 0.
        curl_code = curl_easy_perform(c_handle); // Do the thing...
        ++tries;

        if (CURLE_OK != curl_code) { // Failure here is not an HTTP error, but a cURL error.
            throw BESInternalError(
                string("read_data() - ERROR! Message: ").append(error_message(curl_code, curlErrorBuf)),
                __FILE__, __LINE__);
        }

        success = eval_get_response(c_handle, urlp);
        // if(debug) cout << ngap_curl::probe_easy_handle(c_handle) << endl;
        if (!success) {
            if (tries == retry_limit) {
                string msg = prolog + "Data transfer error: Number of re-tries exceeded: "+ error_message(curl_code, curlErrorBuf);
                LOG(msg << endl);
                throw BESInternalError(msg, __FILE__, __LINE__);
            }
            else {
                if (BESDebug::IsSet(MODULE)) {
                    stringstream ss;
                    ss << "HTTP transfer 500 error, will retry (trial " << tries << " for: " << urlp << ").";
                    BESDEBUG(MODULE, ss.str());
                }
                usleep(retry_time);
                retry_time *= 2;
            }
        }

    } while (!success);
#endif
    // Detach curlErrorBuf from the handle: it is a local and dies when this function returns.
    unset_error_buffer(c_handle);
}
#endif
1199 
1200 string get_cookie_file_base() {
1201  bool found = false;
1202  string cookie_filename;
1203  TheBESKeys::TheKeys()->get_value(HTTP_COOKIES_FILE_KEY, cookie_filename, found);
1204  if (!found) {
1205  cookie_filename = HTTP_DEFAULT_COOKIES_FILE;
1206  }
1207  return cookie_filename;
1208 }
1209 
1210 string get_cookie_filename() {
1211  string cookie_file_base = get_cookie_file_base();
1212  stringstream cf_with_pid;
1213  cf_with_pid << cookie_file_base << "-" << getpid();
1214  return cf_with_pid.str();
1215 }
1216 
1217 void clear_cookies() {
1218  string cf = get_cookie_filename();
1219  int ret = unlink(cf.c_str());
1220  if (ret) {
1221  string msg = prolog + "Failed to unlink the cookie file: " + cf;
1222  ERROR_LOG(msg << endl);
1223  BESDEBUG(MODULE, prolog << msg << endl);
1224  }
1225 }
1226 
1227 
1235 bool is_retryable(std::string target_url) {
1236  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
1237  bool retryable = true;
1238 
1239  vector<string> nr_regexs;
1240  bool found;
1241  TheBESKeys::TheKeys()->get_values(HTTP_NO_RETRY_URL_REGEX_KEY, nr_regexs, found);
1242  if (found) {
1243  vector<string>::iterator it;
1244  for (it = nr_regexs.begin(); it != nr_regexs.end() && retryable; it++) {
1245  BESRegex no_retry_regex((*it).c_str(), (*it).size());
1246  size_t match_length;
1247  match_length = no_retry_regex.match(target_url.c_str(), target_url.size(), 0);
1248  if (match_length == target_url.size()) {
1249  BESDEBUG(MODULE, prolog << "The url: '" << target_url << "' fully matched the "
1250  << HTTP_NO_RETRY_URL_REGEX_KEY << ": '" << *it << "'" << endl);
1251  retryable = false;
1252  }
1253  }
1254  }
1255  BESDEBUG(MODULE, prolog << "END retryable: " << (retryable ? "true" : "false") << endl);
1256  return retryable;
1257 }
1258 
1293 bool eval_http_get_response(CURL *ceh, char *error_buffer, const string &requested_url) {
1294  BESDEBUG(MODULE, prolog << "Requested URL: " << requested_url << endl);
1295  CURLcode curl_code;
1296  string last_accessed_url = get_effective_url(ceh, requested_url);
1297  BESDEBUG(MODULE, prolog << "Last Accessed URL(CURLINFO_EFFECTIVE_URL): " << last_accessed_url << endl);
1298 
1299  long http_code = 0;
1300 
1301  curl_code = curl_easy_getinfo(ceh, CURLINFO_RESPONSE_CODE, &http_code);
1302  if (curl_code == CURLE_GOT_NOTHING) {
1303  // First we check to see if the response was empty. This is a cURL error, not an HTTP error
1304  // so we have to handle it like this. And we do that because this is one of the failure modes
1305  // we see in the AWS cloud and by trapping this and returning false we are able to be resilient and retry.
1306  stringstream msg;
1307  msg << prolog << "ERROR - cURL returned CURLE_GOT_NOTHING. Message: '";
1308  msg << error_message(curl_code, error_buffer) << "' ";
1309  msg << "CURLINFO_EFFECTIVE_URL: " << last_accessed_url << " ";
1310  msg << "A retry may be possible for: " << requested_url << ")." << endl;
1311  BESDEBUG(MODULE, msg.str());
1312  ERROR_LOG(msg.str());
1313  return false;
1314  }
1315  else if (curl_code != CURLE_OK) {
1316  // Not an error we are trapping so it's fail time.
1317  throw BESInternalError(
1318  string("Error acquiring HTTP response code: ").append(curl::error_message(curl_code, error_buffer)),
1319  __FILE__, __LINE__);
1320  }
1321 
1322  if (BESDebug::IsSet(MODULE)) {
1323  long redirects;
1324  curl_easy_getinfo(ceh, CURLINFO_REDIRECT_COUNT, &redirects);
1325  BESDEBUG(MODULE, prolog << "CURLINFO_REDIRECT_COUNT: " << redirects << endl);
1326 
1327  char *redirect_url = NULL;
1328  curl_easy_getinfo(ceh, CURLINFO_REDIRECT_URL, &redirect_url);
1329  if (redirect_url)
1330  BESDEBUG(MODULE, prolog << "CURLINFO_REDIRECT_URL: " << redirect_url << endl);
1331  }
1332 
1333  stringstream msg;
1334  if (http_code >= 400) {
1335  msg << "ERROR - The HTTP GET request for the source URL: " << requested_url << " FAILED. ";
1336  msg << "CURLINFO_EFFECTIVE_URL: " << last_accessed_url << " ";
1337  BESDEBUG(MODULE, prolog << msg.str() << endl);
1338  }
1339  msg << "The response had an HTTP status of " << http_code;
1340  msg << " which means '" << http_status_to_string(http_code) << "'";
1341 
1342  // Newer Apache servers return 206 for range requests. jhrg 8/8/18
1343  switch (http_code) {
1344  case 200: // OK
1345  case 206: // Partial content - this is to be expected since we use range gets
1346  // cases 201-205 are things we should probably reject, unless we add more
1347  // comprehensive HTTP/S processing here. jhrg 8/8/18
1348  return true;
1349 
1350  case 400: // Bad Request
1351  ERROR_LOG(msg.str() << endl);
1352  throw BESSyntaxUserError(msg.str(), __FILE__, __LINE__);
1353 
1354  case 401: // Unauthorized
1355  case 402: // Payment Required
1356  case 403: // Forbidden
1357  ERROR_LOG(msg.str() << endl);
1358  throw BESForbiddenError(msg.str(), __FILE__, __LINE__);
1359 
1360  case 404: // Not Found
1361  ERROR_LOG(msg.str() << endl);
1362  throw BESNotFoundError(msg.str(), __FILE__, __LINE__);
1363 
1364  case 408: // Request Timeout
1365  ERROR_LOG(msg.str() << endl);
1366  throw BESTimeoutError(msg.str(), __FILE__, __LINE__);
1367 
1368  case 422: // Unprocessable Entity
1369  case 500: // Internal server error
1370  case 502: // Bad Gateway
1371  case 503: // Service Unavailable
1372  case 504: // Gateway Timeout
1373  {
1374  if (!is_retryable(last_accessed_url)) {
1375  msg << " The semantics of this particular last accessed URL indicate that it should not be retried.";
1376  ERROR_LOG(msg.str() << endl);
1377  throw BESInternalError(msg.str(), __FILE__, __LINE__);
1378  }
1379  return false;
1380  }
1381 
1382  default: {
1383  ERROR_LOG(msg.str() << endl);
1384  throw BESInternalError(msg.str(), __FILE__, __LINE__);
1385  }
1386  }
1387 }
1388 
1389 
1410 bool eval_curl_easy_perform_code(
1411  CURL *ceh,
1412  const string requested_url,
1413  CURLcode curl_code,
1414  char *error_buffer,
1415  const unsigned int attempt
1416 ) {
1417  bool success = true;
1418  string last_accessed_url = get_effective_url(ceh, requested_url);
1419  if (curl_code == CURLE_SSL_CONNECT_ERROR) {
1420  stringstream msg;
1421  msg << prolog << "ERROR - cURL experienced a CURLE_SSL_CONNECT_ERROR error. Message: '";
1422  msg << error_message(curl_code, error_buffer) << "' ";
1423  msg << "CURLINFO_EFFECTIVE_URL: " << last_accessed_url << " ";
1424  msg << "A retry may be possible for: " << requested_url << " (attempt: " << attempt << ")." << endl;
1425  BESDEBUG(MODULE, msg.str());
1426  ERROR_LOG(msg.str());
1427  success = false;
1428  }
1429  else if (curl_code == CURLE_SSL_CACERT_BADFILE) {
1430  stringstream msg;
1431  msg << prolog << "ERROR - cURL experienced a CURLE_SSL_CACERT_BADFILE error. Message: '";
1432  msg << error_message(curl_code, error_buffer) << "' ";
1433  msg << "CURLINFO_EFFECTIVE_URL: " << last_accessed_url << " ";
1434  msg << "A retry may be possible for: " << requested_url << " (attempt: " << attempt << ")." << endl;
1435  BESDEBUG(MODULE, msg.str());
1436  ERROR_LOG(msg.str());
1437  success = false;
1438  }
1439  else if (curl_code == CURLE_GOT_NOTHING) {
1440  // First we check to see if the response was empty. This is a cURL error, not an HTTP error
1441  // so we have to handle it like this. And we do that because this is one of the failure modes
1442  // we see in the AWS cloud and by trapping this and returning false we are able to be resilient and retry.
1443  stringstream msg;
1444  msg << prolog << "ERROR - cURL returned CURLE_GOT_NOTHING. Message: ";
1445  msg << error_message(curl_code, error_buffer) << "' ";
1446  msg << "CURLINFO_EFFECTIVE_URL: " << last_accessed_url << " ";
1447  msg << "A retry may be possible for: " << requested_url << " (attempt: " << attempt << ")." << endl;
1448  BESDEBUG(MODULE, msg.str());
1449  ERROR_LOG(msg.str());
1450  return false;
1451  }
1452  else if (CURLE_OK != curl_code) {
1453  stringstream msg;
1454  msg << "ERROR - Problem with data transfer. Message: " << error_message(curl_code, error_buffer);
1455  string effective_url = get_effective_url(ceh, requested_url);
1456  msg << " CURLINFO_EFFECTIVE_URL: " << effective_url;
1457  BESDEBUG(MODULE, prolog << msg.str() << endl);
1458  ERROR_LOG(msg.str() << endl);
1459  throw BESInternalError(msg.str(), __FILE__, __LINE__);
1460  }
1461  return success;
1462 }
1463 
// DISABLED CODE: everything between this '#if 0' and the matching '#endif' is
// removed by the preprocessor and is never compiled.
//
// This variant of retrieve_effective_url() reports the effective URL through
// the last_accessed_url out-parameter instead of returning an
// http::EffectiveUrl object, and does not hand the collected response headers
// (resp_hdrs) back to the caller.
#if 0
void retrieve_effective_url(const string &target_url, string &last_accessed_url) {
    vector<string> resp_hdrs;
    CURL *ceh = NULL;
    // CURLcode curl_code;
    curl_slist *request_headers = NULL;

    BESDEBUG(MODULE, prolog << "BEGIN" << endl);

    // Add the authorization headers
    request_headers = add_auth_headers(request_headers);

    try {
        BESDEBUG(MODULE,
                 prolog << "BESDebug::IsSet(" << MODULE << "): " << (BESDebug::IsSet(MODULE) ? "true" : "false")
                        << endl);
        BESDEBUG(MODULE, prolog << "BESDebug::IsSet(" << TIMING_LOG_KEY << "): "
                                << (BESDebug::IsSet(TIMING_LOG_KEY) ? "true" : "false") << endl);
        BESDEBUG(MODULE,
                 prolog << "BESLog::TheLog()->is_verbose(): " << (BESLog::TheLog()->is_verbose() ? "true" : "false")
                        << endl);

        ceh = init_effective_url_retriever_handle(target_url, request_headers, resp_hdrs);

        {
            // The stop watch is started only when one of these debug/logging switches is on.
            BESStopWatch sw;
            if (BESDebug::IsSet("euc") || BESDebug::IsSet(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY) ||
                BESLog::TheLog()->is_verbose()) {
                sw.start(prolog + " Following Redirects Starting With: " + target_url);
            }
            super_easy_perform(ceh);
        }

        // After doing the thing with super_easy_perform() we retrieve the effective URL from the cURL handle.
        last_accessed_url = get_effective_url(ceh, target_url);
        BESDEBUG(MODULE, prolog << "Last Accessed URL(CURLINFO_EFFECTIVE_URL): " << last_accessed_url << endl);
        INFO_LOG(
                prolog << "Source URL: '" << target_url << "' CURLINFO_EFFECTIVE_URL: '" << last_accessed_url << "'"
                       << endl);

        if (request_headers)
            curl_slist_free_all(request_headers);
        if (ceh)
            curl_easy_cleanup(ceh);
    }
    catch (...) {
        // Release the cURL resources, then let the original exception propagate.
        if (request_headers)
            curl_slist_free_all(request_headers);
        if (ceh)
            curl_easy_cleanup(ceh);
        throw;
    }
}
#endif
1525 
1534  http::EffectiveUrl *retrieve_effective_url(const string &target_url) {
1535 
1536  vector<string> resp_hdrs;
1537  CURL *ceh = NULL;
1538  // CURLcode curl_code;
1539  curl_slist *request_headers = NULL;
1540 
1541  BESDEBUG(MODULE, prolog << "BEGIN" << endl);
1542 
1543  // Add the authorization headers
1544  request_headers = add_auth_headers(request_headers);
1545 
1546  try {
1547  BESDEBUG(MODULE,
1548  prolog << "BESDebug::IsSet(" << MODULE << "): " << (BESDebug::IsSet(MODULE) ? "true" : "false")
1549  << endl);
1550  BESDEBUG(MODULE, prolog << "BESDebug::IsSet(" << TIMING_LOG_KEY << "): "
1551  << (BESDebug::IsSet(TIMING_LOG_KEY) ? "true" : "false") << endl);
1552  BESDEBUG(MODULE, prolog << "BESLog::TheLog()->is_verbose(): "
1553  << (BESLog::TheLog()->is_verbose() ? "true" : "false") << endl);
1554 
1555  ceh = init_effective_url_retriever_handle(target_url, request_headers, resp_hdrs);
1556 
1557  {
1558  BESStopWatch sw;
1559  if (BESDebug::IsSet("euc") || BESDebug::IsSet(MODULE) || BESDebug::IsSet(TIMING_LOG_KEY) ||
1560  BESLog::TheLog()->is_verbose()) {
1561  sw.start(prolog + " Following Redirects Starting With: " + target_url);
1562  }
1563  super_easy_perform(ceh);
1564  }
1565 
1566  // After doing the thing with super_easy_perform() we retrieve the effective URL form the cURL handle.
1567  string effective_url_str = get_effective_url(ceh, target_url);
1568  BESDEBUG(MODULE, prolog << "Last Accessed URL(CURLINFO_EFFECTIVE_URL): " << effective_url_str << endl);
1569  INFO_LOG(prolog << "Source URL: '" << target_url << "' CURLINFO_EFFECTIVE_URL: '" << effective_url_str
1570  << "'"
1571  << endl);
1572 
1573  auto *eurl = new EffectiveUrl(effective_url_str, resp_hdrs);
1574 
1575  if (request_headers)
1576  curl_slist_free_all(request_headers);
1577  if (ceh)
1578  curl_easy_cleanup(ceh);
1579 
1580  return eurl;
1581  }
1582  catch (...) {
1583  if (request_headers)
1584  curl_slist_free_all(request_headers);
1585  if (ceh)
1586  curl_easy_cleanup(ceh);
1587  throw;
1588  }
1589 
1590 #if 0
1591  {
1592  unsigned int attempts = 0;
1593  bool success = true;
1594  useconds_t retry_time = uone_second / 4;
1595 
1596  char error_buffer[CURL_ERROR_SIZE];
1597  vector<string> resp_hdrs;
1598  CURL *ceh = NULL;
1599  CURLcode curl_code;
1600 
1601  struct curl_slist *request_headers = NULL;
1602  // Add the authorization headers
1603  request_headers = get_auth_headers(request_headers);
1604 
1605  try {
1606  ceh = init_effective_url_retriever_handle(url, request_headers, resp_hdrs);
1607  set_error_buffer(ceh, error_buffer);
1608  do {
1609  // bool do_retry;
1610  error_buffer[0] = 0; // Initialize to empty string
1611  ++attempts;
1612  BESDEBUG(MODULE, prolog << "Requesting URL: " << target_url << " attempt: " << attempts << endl);
1613 
1614  curl_code = curl_easy_perform(ceh);
1615  success = eval_curl_easy_perform_code(ceh, target_url, curl_code, error_buffer, attempts);
1616  if (success) {
1617  // Nothing obvious went wrong with the curl_easy_perfom() so now we check the HTTP stuff
1618  success = eval_http_get_response(ceh, target_url);
1619  if (!success) {
1620  if (attempts == retry_limit) {
1621  string msg = prolog +
1622  "ERROR - Problem with data transfer. Number of re-tries exceeded. Giving up.";
1623  LOG(msg << endl);
1624  throw BESInternalError(msg, __FILE__, __LINE__);
1625  } else {
1626  LOG(prolog << "ERROR - Problem with data transfer. Will retry (url: " << target_url <<
1627  " attempt: " << attempts << ")." << endl);
1628  }
1629  }
1630  }
1631  // If it did not work we keep trying until we have exceeded the retry_limit.
1632  if (!success) {
1633  usleep(retry_time);
1634  retry_time *= 2;
1635  }
1636  } while (!success);
1637 
1638  char *effective_url = 0;
1639  curl_easy_getinfo(ceh, CURLINFO_EFFECTIVE_URL, &effective_url);
1640  BESDEBUG(MODULE, prolog << " CURLINFO_EFFECTIVE_URL: " << effective_url << endl);
1641  last_accessed_url = effective_url;
1642 
1643  LOG(prolog << "Source URL: '" << target_url << "' Last Accessed URL: '" << last_accessed_url << "'" << endl);
1644 
1645  unset_error_buffer(ceh);
1646 
1647  if (ceh) {
1648  curl_slist_free_all(request_headers);
1649  curl_easy_cleanup(ceh);
1650  ceh = 0;
1651  }
1652  }
1653  catch (...) {
1654  if (request_headers)
1655  curl_slist_free_all(request_headers);
1656  if (ceh) {
1657  curl_easy_cleanup(ceh);
1658  ceh = 0;
1659  }
1660  throw;
1661  }
1662  }
1663 #endif
1664  }
1665 
1675 string get_netrc_filename() {
1676  string netrc_filename;
1677  bool found = false;
1678  TheBESKeys::TheKeys()->get_value(HTTP_NETRC_FILE_KEY, netrc_filename, found);
1679  if (found) {
1680  BESDEBUG(MODULE, prolog << "Using netrc file: " << netrc_filename << endl);
1681  }
1682  else {
1683  BESDEBUG(MODULE, prolog << "Using default netrc file. (~/.netrc)" << endl);
1684  }
1685  return netrc_filename;
1686 }
1687 
1693 void set_error_buffer(CURL *ceh, char *error_buffer) {
1694  CURLcode res;
1695  res = curl_easy_setopt(ceh, CURLOPT_ERRORBUFFER, error_buffer);
1696  curl::eval_curl_easy_setopt_result(res, prolog, "CURLOPT_ERRORBUFFER", error_buffer, __FILE__, __LINE__);
1697 }
1698 
1704 void unset_error_buffer(CURL *ceh) {
1705  set_error_buffer(ceh, NULL);
1706 }
1707 
1708 
/**
 * @brief Returns the user agent name used for requests.
 *
 * @return The fixed string "Hyrax".
 */
string hyrax_user_agent() {
    // A fixed product name is reported rather than the output of curl_version().
    const string agent_name("Hyrax");
    return agent_name;
}
1717 
1733 void eval_curl_easy_setopt_result(
1734  CURLcode curl_code,
1735  string msg_base,
1736  string opt_name,
1737  char *ebuf,
1738  string file,
1739  unsigned int line) {
1740  if (curl_code != CURLE_OK) {
1741  stringstream msg;
1742  msg << msg_base << "ERROR - cURL failed to set " << opt_name << " Message: " << curl::error_message(curl_code, ebuf);
1743  throw BESInternalError(msg.str(), file, line);
1744  }
1745 }
1746 
1747 unsigned long max_redirects() {
1749 }
1750 
1762 curl_slist *append_http_header(curl_slist *slist, const string &header_name, const string &value)
1763 {
1764 
1765  string full_header = header_name;
1766  full_header.append(": ").append(value);
1767 
1768  BESDEBUG(MODULE, prolog << full_header << endl);
1769  // std::cerr << prolog << full_header << endl;
1770 
1771  struct curl_slist *temp = curl_slist_append(slist, full_header.c_str());
1772  if (!temp){
1773  stringstream msg;
1774  msg << prolog << "Encountered cURL Error setting the " << header_name << " header. full_header: " << full_header;
1775  throw BESInternalError(msg.str(), __FILE__, __LINE__);
1776  }
1777  return temp;
1778 }
1779 
1780 
1811 curl_slist *add_auth_headers(curl_slist *request_headers) {
1812  bool found;
1813  string s;
1814 
1815  s = BESContextManager::TheManager()->get_context(EDL_UID_KEY, found);
1816  if (found && !s.empty()) {
1817  request_headers = append_http_header(request_headers,"User-Id",s);
1818  }
1819 
1820  s = BESContextManager::TheManager()->get_context(EDL_AUTH_TOKEN_KEY, found);
1821  if (found && !s.empty()) {
1822  request_headers = append_http_header(request_headers,"Authorization",s);
1823  }
1824 
1825  s = BESContextManager::TheManager()->get_context(EDL_ECHO_TOKEN_KEY, found);
1826  if (found && !s.empty()) {
1827  request_headers = append_http_header(request_headers,"Echo-Token",s);
1828  }
1829 
1830  return request_headers;
1831 }
1832 
1840 string get_effective_url(CURL *ceh, string requested_url) {
1841  char *effectve_url = NULL;
1842  CURLcode curl_code = curl_easy_getinfo(ceh, CURLINFO_EFFECTIVE_URL, &effectve_url);
1843  if (curl_code != CURLE_OK) {
1844  stringstream msg;
1845  msg << prolog << "Unable to determine CURLINFO_EFFECTIVE_URL! Requested URL: " << requested_url;
1846  BESDEBUG(MODULE, msg.str() << endl);
1847  throw BESInternalError(msg.str(), __FILE__, __LINE__);
1848  }
1849  return effectve_url;
1850 }
1851 
1852 
1853 } /* namespace curl */
virtual std::string get_context(const std::string &name, bool &found)
retrieve the value of the specified context from the BES
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:160
error thrown if the BES is not allowed to access the resource requested
exception thrown if internal error encountered
error thrown if the resource requested cannot be found
virtual bool start(std::string name)
Definition: BESStopWatch.cc:67
error thrown if there is a user syntax error in the request or any other user error
error thrown if there is a user syntax error in the request or any other user error
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
Definition: TheBESKeys.cc:339
static TheBESKeys * TheKeys()
Definition: TheBESKeys.cc:71
void get_values(const std::string &s, std::vector< std::string > &vals, bool &found)
Retrieve the values of a given key, if set.
Definition: TheBESKeys.cc:370
static AllowedHosts * theHosts()
Static accessor for the singleton.
Definition: AllowedHosts.cc:58
GenericDocument< UTF8<> > Document
GenericDocument with UTF8 encoding.
Definition: document.h:2585
utility class for the HTTP catalog module
Definition: EffectiveUrl.cc:58
size_t load_max_redirects_from_keys()
Definition: HttpUtils.cc:183