bes  Updated for version 3.20.8
HttpdDirScraper.cc
1 // -*- mode: c++; c-basic-offset:4 -*-
2 //
3 // This file is part of httpd_catalog_module, A C++ module that can be loaded in to
4 // the OPeNDAP Back-End Server (BES) and is able to handle remote requests.
5 //
6 // Copyright (c) 2018 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
#include <ctype.h> /* isalpha and isdigit */
#include <stdlib.h> /* atol */
#include <time.h> /* mktime */

#include <fstream>
#include <iostream>
#include <limits> /* numeric_limits */
#include <sstream>
31 
32 #include <BESDebug.h>
33 #include <BESUtil.h>
34 #include <BESRegex.h>
35 #include <BESCatalogList.h>
36 #include <BESCatalogUtils.h>
37 #include <CatalogItem.h>
38 
39 #include "RemoteResource.h"
40 #include "HttpdCatalogNames.h"
41 
42 #include "HttpdDirScraper.h"
43 
44 using namespace std;
45 using bes::CatalogItem;
46 
47 #define prolog std::string("HttpdDirScraper::").append(__func__).append("() - ")
48 
49 namespace httpd_catalog {
50 
51 HttpdDirScraper::HttpdDirScraper()
52 {
53  // There was probably a better way to make this association but this worked.
54  d_months.insert(pair<string, int>(string("jan"), 0));
55  d_months.insert(pair<string, int>(string("feb"), 1));
56  d_months.insert(pair<string, int>(string("mar"), 2));
57  d_months.insert(pair<string, int>(string("apr"), 3));
58  d_months.insert(pair<string, int>(string("may"), 4));
59  d_months.insert(pair<string, int>(string("jun"), 5));
60  d_months.insert(pair<string, int>(string("jul"), 6));
61  d_months.insert(pair<string, int>(string("aug"), 7));
62  d_months.insert(pair<string, int>(string("sep"), 8));
63  d_months.insert(pair<string, int>(string("oct"), 9));
64  d_months.insert(pair<string, int>(string("nov"), 10));
65  d_months.insert(pair<string, int>(string("dec"), 11));
66 }
67 
68 /*
69  * @brief Converts an Apache httpd directory page "size" string (23K, 45M, 32G, etc)
70  * to an actual value, approximate though it may be.
71  */
72 long HttpdDirScraper::get_size_val(const string size_str) const
73 {
74  char scale_c = *size_str.rbegin();
75  long scale = 1;
76 
77  switch (scale_c) {
78  case 'K':
79  scale = 1e3;
80  break;
81  case 'M':
82  scale = 1e6;
83  break;
84  case 'G':
85  scale = 1e9;
86  break;
87  case 'T':
88  scale = 1e12;
89  break;
90  case 'P':
91  scale = 1e15;
92  break;
93  default:
94  scale = 1;
95  break;
96  }
97  BESDEBUG(MODULE, prolog << "scale: " << scale << endl);
98 
99  string result = size_str;
100  if (isalpha(scale_c)) result = size_str.substr(0, size_str.length() - 1);
101 
102  long size = atol(result.c_str());
103  BESDEBUG(MODULE, prolog << "raw size: " << size << endl);
104 
105  size *= scale;
106  BESDEBUG(MODULE, prolog << "scaled size: " << size << endl);
107  return size;
108 }
109 
/**
 * @brief Renders the nine standard fields of a tm struct as text for debugging.
 *
 * @param tms The broken-down time to display.
 * @return One "name: value" pair per line, in the order tm_sec, tm_min,
 * tm_hour, tm_mday, tm_mon, tm_year, tm_wday, tm_yday, tm_isdst. Every line,
 * including the last, ends with a newline.
 */
std::string show_tm_struct(const tm tms)
{
    std::ostringstream text;
    text << "tm_sec: " << tms.tm_sec << '\n'
         << "tm_min: " << tms.tm_min << '\n'
         << "tm_hour: " << tms.tm_hour << '\n'
         << "tm_mday: " << tms.tm_mday << '\n'
         << "tm_mon: " << tms.tm_mon << '\n'
         << "tm_year: " << tms.tm_year << '\n'
         << "tm_wday: " << tms.tm_wday << '\n'
         << "tm_yday: " << tms.tm_yday << '\n'
         << "tm_isdst: " << tms.tm_isdst << '\n';
    return text.str();
}
127 
/**
 * @brief Resets a tm struct to a known baseline: every field zero except
 * tm_mday, which is set to 1 (the first valid day of a month).
 *
 * The whole struct is value-initialized first so that any fields a platform
 * adds beyond the nine standard ones are cleared too, instead of keeping
 * whatever the caller's uninitialized storage held.
 *
 * @param tms The struct to reset; overwritten in place.
 */
void zero_tm_struct(tm &tms)
{
    tms = tm();
    tms.tm_mday = 1;
}
143 
144 
145 string HttpdDirScraper::httpd_time_to_iso_8601(const string httpd_time) const
146 {
147  vector<string> tokens;
148  string delimiters = "- :";
149  BESUtil::tokenize(httpd_time, tokens, delimiters);
150 
151  BESDEBUG(MODULE, prolog << "Found " << tokens.size() << " tokens." << endl);
152  vector<string>::iterator it = tokens.begin();
153  int i = 0;
154  if (BESDebug::IsSet(MODULE)) {
155  while (it != tokens.end()) {
156  BESDEBUG(MODULE, prolog << " token["<< i++ << "]: "<< *it << endl);
157  it++;
158  }
159  }
160 
161  BESDEBUG(MODULE, prolog << "Second Field: "<< tokens[1] << endl);
162 
163  const char *second_field = tokens[1].c_str();
164  bool is_alpha = true;
165  for(unsigned long i=0; is_alpha && i< tokens[1].length(); i++){
166  is_alpha = isalpha(second_field[i]);
167  }
168  time_t theTime;
169  if(is_alpha){
170  BESDEBUG(MODULE, prolog << "Detected Time Format A (\"DD-MM-YYY hh:mm\")" << endl);
171  theTime = parse_time_format_A(tokens);
172  }
173  else {
174  BESDEBUG(MODULE, prolog << "Detected Time Format B (\"YYYY-MM-DD hh:mm\")" << endl);
175  theTime = parse_time_format_B(tokens);
176  }
177  return BESUtil::get_time(theTime, false);
178 
179 }
180 
186 time_t HttpdDirScraper::parse_time_format_A(const vector<string> tokens) const
187 {
188  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
189  struct tm tm;
190  zero_tm_struct(tm);
191 
192  if (tokens.size() > 2) {
193  std::istringstream(tokens[0]) >> tm.tm_mday;
194  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
195 
196  pair<string, int> mnth = *d_months.find(BESUtil::lowercase(tokens[1]));
197  BESDEBUG(MODULE, prolog << " mnth.first: "<< mnth.first << endl);
198  BESDEBUG(MODULE, prolog << " mnth.second: "<< mnth.second << endl);
199  tm.tm_mon = mnth.second;
200  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
201 
202  std::istringstream(tokens[2]) >> tm.tm_year;
203  tm.tm_year -= 1900;
204  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
205 
206  if (tokens.size() > 4) {
207  std::istringstream(tokens[3]) >> tm.tm_hour;
208  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
209  std::istringstream(tokens[4]) >> tm.tm_min;
210  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
211  }
212  }
213 
214  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
215 
216  time_t theTime = mktime(&tm);
217  BESDEBUG(MODULE, prolog << "theTime: " << theTime << endl);
218  return theTime;
219 }
220 
226 time_t HttpdDirScraper::parse_time_format_B(const vector<string> tokens) const
227 {
228  // void BESUtil::tokenize(const string& str, vector<string>& tokens, const string& delimiters)
229  struct tm tm;
230  zero_tm_struct(tm);
231 
232  if (tokens.size() > 2) {
233  std::istringstream(tokens[0]) >> tm.tm_year;
234  tm.tm_year -= 1900;
235  BESDEBUG(MODULE, prolog << " tm.tm_year: "<< tm.tm_year << endl);
236 
237  std::istringstream(tokens[1]) >> tm.tm_mon;
238  BESDEBUG(MODULE, prolog << " tm.tm_mon: "<< tm.tm_mon << endl);
239 
240  std::istringstream(tokens[2]) >> tm.tm_mday;
241  BESDEBUG(MODULE, prolog << " tm.tm_mday: "<< tm.tm_mday << endl);
242 
243  if (tokens.size() > 4) {
244  std::istringstream(tokens[3]) >> tm.tm_hour;
245  BESDEBUG(MODULE, prolog << " tm.tm_hour: "<< tm.tm_hour << endl);
246  std::istringstream(tokens[4]) >> tm.tm_min;
247  BESDEBUG(MODULE, prolog << " tm.tm_min: "<< tm.tm_min << endl);
248  }
249  }
250 
251  BESDEBUG(MODULE, prolog << "tm struct: " << endl << show_tm_struct(tm));
252 
253  time_t theTime = mktime(&tm);
254  BESDEBUG(MODULE, prolog << "ISO-8601 Time: " << theTime << endl);
255  return theTime;
256 }
257 
274 void HttpdDirScraper::createHttpdDirectoryPageMap(std::string url, std::map<std::string, bes::CatalogItem *> &items) const
275 {
276  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
277 
278  // Go get the text from the remote resource
279  http::RemoteResource rhr(url);
280  rhr.retrieveResource();
281  stringstream buffer;
282 
283  ifstream cache_file_is(rhr.getCacheFileName().c_str());
284  if(!cache_file_is.is_open()){
285  string msg = prolog + "ERROR - Failed to open cache file: " + rhr.getCacheFileName();
286  BESDEBUG(MODULE, msg << endl);
287  throw BESInternalError(msg ,__FILE__, __LINE__ );
288  }
289 
290  buffer << cache_file_is.rdbuf();
291  string pageStr = buffer.str();
292  BESDEBUG(MODULE, prolog << "Page Content: " << endl << pageStr << endl);
293 
294  // Does it look like an Apache httpd Index listing?
295  if(pageStr.find("<title>Index of ") == string::npos){
296  // Nope. Time to leave.
297  BESDEBUG(MODULE, prolog << "The url: " << url << " does not appear to reference an Apache httpd Index page." << endl);
298  return;
299  }
300 
301  string aOpenStr = "<a ";
302  string aCloseStr = "</a>";
303  string hrefStr = "href=\"";
304  string tdOpenStr = "<td ";
305  string tdCloseStr = "</td>";
306 
307  BESRegex hrefExcludeRegex("(^#.*$)|(^\\?C.*$)|(redirect\\/)|(^\\/$)|(^<img.*$)");
308  BESRegex nameExcludeRegex("^Parent Directory$");
309 
310  bool done = false;
311  int next_start = 0;
312  while (!done) {
313  int aOpenIndex = pageStr.find(aOpenStr, next_start);
314  if (aOpenIndex < 0) {
315  done = true;
316  }
317  else {
318  int aCloseIndex = pageStr.find(aCloseStr, aOpenIndex + aOpenStr.length());
319  if (aCloseIndex < 0) {
320  done = true;
321  }
322  else {
323  int length;
324 
325  // Locate the entire <a /> element
326  BESDEBUG(MODULE, prolog << "aOpenIndex: " << aOpenIndex << endl);
327  BESDEBUG(MODULE, prolog << "aCloseIndex: " << aCloseIndex << endl);
328  length = aCloseIndex + aCloseStr.length() - aOpenIndex;
329  string aElemStr = pageStr.substr(aOpenIndex, length);
330  BESDEBUG(MODULE, prolog << "Processing link: " << aElemStr << endl);
331 
332  // Find the link text
333  int start = aElemStr.find(">") + 1;
334  int end = aElemStr.find("<", start);
335  length = end - start;
336  string linkText = aElemStr.substr(start, length);
337  BESDEBUG(MODULE, prolog << "Link Text: " << linkText << endl);
338 
339  // Locate the href attribute
340  start = aElemStr.find(hrefStr) + hrefStr.length();
341  end = aElemStr.find("\"", start);
342  length = end - start;
343  string href = aElemStr.substr(start, length);
344  BESDEBUG(MODULE, prolog << "href: " << href << endl);
345 
346  // attempt to get time string
347  string time_str;
348  int start_pos = getNextElementText(pageStr, "td", aCloseIndex + aCloseStr.length(), time_str);
349  BESDEBUG(MODULE, prolog << "time_str: '" << time_str << "'" << endl);
350 
351  // attempt to get size string
352  string size_str;
353  start_pos = getNextElementText(pageStr, "td", start_pos, size_str);
354  BESDEBUG(MODULE, prolog << "size_str: '" << size_str << "'" << endl);
355 
356  if ((linkText.find("<img") != string::npos) || !(linkText.length()) || (linkText.find("<<<") != string::npos)
357  || (linkText.find(">>>") != string::npos)) {
358  BESDEBUG(MODULE, prolog << "SKIPPING(image|copy|<<<|>>>): " << aElemStr << endl);
359  }
360  else {
361  if (href.length() == 0 || (((href.find("http://") == 0) || (href.find("https://") == 0)) && !(href.find(url) == 0))) {
362  // SKIPPING
363  BESDEBUG(MODULE, prolog << "SKIPPING(null or remote): " << href << endl);
364  }
365  else if (hrefExcludeRegex.match(href.c_str(), href.length(), 0) > 0) {
366  // SKIPPING
367  BESDEBUG(MODULE, prolog << "SKIPPING(hrefExcludeRegex) - href: '" << href << "'"<< endl);
368  }
369  else if (nameExcludeRegex.match(linkText.c_str(), linkText.length(), 0) > 0) {
370  // SKIPPING
371  BESDEBUG(MODULE, prolog << "SKIPPING(nameExcludeRegex) - name: '" << linkText << "'" << endl);
372  }
373  else if (BESUtil::endsWith(href, "/")) {
374  string node_name = href.substr(0, href.length() - 1);
375  // it's a directory aka a node
376  BESDEBUG(MODULE, prolog << "NODE: " << node_name << endl);
377  bes::CatalogItem *childNode = new bes::CatalogItem();
378  childNode->set_type(CatalogItem::node);
379  childNode->set_name(node_name);
380  childNode->set_is_data(false);
381  string iso_8601_time = httpd_time_to_iso_8601(time_str);
382  childNode->set_lmt(iso_8601_time);
383  // FIXME: For nodes the size should be the number of children, but how without crawling?
384  long size = get_size_val(size_str);
385  childNode->set_size(size);
386 
387  items.insert(pair<std::string, bes::CatalogItem *>(node_name, childNode));
388  }
389  else {
390  // It's a file aka a leaf
391  BESDEBUG(MODULE, prolog << "LEAF: " << href << endl);
392  CatalogItem *leafItem = new CatalogItem();
393  leafItem->set_type(CatalogItem::leaf);
394  leafItem->set_name(href);
395  leafItem->set_is_data(cat_utils->is_data(href));
396  string iso_8601_time = httpd_time_to_iso_8601(time_str);
397  leafItem->set_lmt(iso_8601_time);
398  long size = get_size_val(size_str);
399  leafItem->set_size(size);
400 
401  items.insert(pair<std::string, bes::CatalogItem *>(href, leafItem));
402  }
403  }
404  }
405  next_start = aCloseIndex + aCloseStr.length();
406  }
407  }
408 }
409 
422 int HttpdDirScraper::getNextElementText(const string &page_str, const string element_name, int startIndex, string &resultText, bool trim) const
423 {
424  string e_open_str = "<" + element_name + " ";
425  string e_close_str = "</" + element_name + ">";
426 
427  // Locate the next "element_name" element
428  int start = page_str.find(e_open_str, startIndex);
429  int end = page_str.find(e_close_str, start + e_open_str.length());
430  if(start<0 || end<0 || end<start){
431  resultText="";
432  return startIndex;
433  }
434 
435  int length = end + e_close_str.length() - start;
436  string element_str = page_str.substr(start, length);
437 
438  // Find the text
439  start = element_str.find(">") + 1;
440  end = element_str.find("<", start);
441  length = end - start;
442  resultText = element_str.substr(start, length);
443 
444  if (trim) BESUtil::removeLeadingAndTrailingBlanks(resultText);
445 
446  BESDEBUG(MODULE, prolog << "resultText: '" << resultText << "'" << endl);
447  return startIndex + element_str.length();
448 }
449 
450 /*
451  * @brief Returns the catalog node represented by the httpd directory page returned
452  * by dereferencing the passed url.
453  * @param url The url of the Apache httpd directory to process.
454  * @param path The path prefix that associates the location of this generated CatalogNode with it's
455  * correct position in the local service path.
456  */
457 bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
458 {
459  BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
460  bes::CatalogNode *node = new bes::CatalogNode(path);
461 
462  if (BESUtil::endsWith(url, "/")) {
463  // This always means the URL points to a node when coming from httpd
464  map<string, bes::CatalogItem *> items;
465  createHttpdDirectoryPageMap(url, items);
466 
467  BESDEBUG(MODULE, prolog << "Found " << items.size() << " items." << endl);
468  map<string, bes::CatalogItem *>::iterator it;
469  it = items.begin();
470  while (it != items.end()) {
471  bes::CatalogItem *item = it->second;
472  BESDEBUG(MODULE, prolog << "Adding item: '" << item->get_name() << "'"<< endl);
473  if (item->get_type() == CatalogItem::node)
474  node->add_node(item);
475  else
476  node->add_leaf(item);
477  it++;
478  }
479  }
480  else {
481  // It's a leaf aka "item" response.
482  const BESCatalogUtils *cat_utils = BESCatalogList::TheCatalogList()->find_catalog(BES_DEFAULT_CATALOG)->get_catalog_utils();
483  std::vector<std::string> url_parts = BESUtil::split(url, '/', true);
484  string leaf_name = url_parts.back();
485 
486  CatalogItem *item = new CatalogItem();
487  item->set_type(CatalogItem::leaf);
488  item->set_name(leaf_name);
489  item->set_is_data(cat_utils->is_data(leaf_name));
490 
491  // FIXME: Find the Last Modified date? Head??
492  item->set_lmt(BESUtil::get_time(true));
493 
494  // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
495  item->set_size(1);
496 
497  node->set_leaf(item);
498  }
499  return node;
500 }
501 
#if 0

// Earlier implementation of get_node(), kept disabled. It relies on a
// createHttpdDirectoryPageMap() variant that fills two sets of names (nodes
// and leaves) and stamps every item with placeholder time and size values.
bes::CatalogNode *HttpdDirScraper::get_node(const string &url, const string &path) const
{
    BESDEBUG(MODULE, prolog << "Processing url: '" << url << "'"<< endl);
    bes::CatalogNode *node = new bes::CatalogNode(path);

    if (!BESUtil::endsWith(url, "/")) {
        // Not a directory: the node holds a single leaf named after the last url segment.
        std::vector<std::string> segments = BESUtil::split(url,'/',true);
        string leaf_name = segments.back();

        CatalogItem *single = new CatalogItem();
        single->set_type(CatalogItem::leaf);
        single->set_name(leaf_name);
        // FIXME: Find the Last Modified date?
        single->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        single->set_size(1);

        node->set_leaf(single);
        return node;
    }

    set<string> pageNodes;
    set<string> pageLeaves;
    createHttpdDirectoryPageMap(url, pageNodes, pageLeaves);

    BESDEBUG(MODULE, prolog << "Found " << pageNodes.size() << " nodes." << endl);
    BESDEBUG(MODULE, prolog << "Found " << pageLeaves.size() << " leaves." << endl);

    for (set<string>::iterator n = pageNodes.begin(); n != pageNodes.end(); n++) {
        string node_name = *n;
        if (BESUtil::endsWith(node_name, "/")) node_name = node_name.substr(0, node_name.length() - 1);

        bes::CatalogItem *child = new bes::CatalogItem();
        child->set_type(CatalogItem::node);
        child->set_name(node_name);
        child->set_is_data(false);

        // FIXME: Figure out the LMT if we can... HEAD?
        child->set_lmt(BESUtil::get_time(true));

        // FIXME: For nodes the size should be the number of children, but how without crawling?
        child->set_size(0);

        node->add_node(child);
    }

    for (set<string>::iterator l = pageLeaves.begin(); l != pageLeaves.end(); l++) {
        string leaf_name = *l;
        CatalogItem *leaf = new CatalogItem();
        leaf->set_type(CatalogItem::leaf);
        leaf->set_name(leaf_name);

        // FIXME: wrangle up the Typematch and see if we think this thing is data or not.
        leaf->set_is_data(false);

        // FIXME: Find the Last Modified date?
        leaf->set_lmt(BESUtil::get_time(true));

        // FIXME: Determine size of this thing? Do we "HEAD" all the leaves?
        leaf->set_size(1);

        node->add_leaf(leaf);
    }

    return node;
}
#endif
581 
582 }
583  // namespace httpd_catalog
584 
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
bool is_data(const std::string &item) const
is there a handler that can process this
virtual BESCatalogUtils * get_catalog_utils() const
Get a pointer to the utilities, customized for this catalog.
Definition: BESCatalog.h:113
static bool IsSet(const std::string &flagName)
see if the debug context flagName is set to true
Definition: BESDebug.h:160
exception thrown if internal error encountered
static std::vector< std::string > split(const std::string &s, char delim='/', bool skip_empty=true)
Splits the string s into the return vector of tokens using the delimiter delim and skipping empty values if skip_empty is true.
Definition: BESUtil.cc:1125
static bool endsWith(std::string const &fullString, std::string const &ending)
Definition: BESUtil.cc:942
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:1057
static std::string lowercase(const std::string &s)
Definition: BESUtil.cc:200
static void removeLeadingAndTrailingBlanks(std::string &key)
Definition: BESUtil.cc:466
static std::string get_time(bool use_local_time=false)
Definition: BESUtil.cc:1079
void set_name(std::string n)
Set the name of the item.
Definition: CatalogItem.h:135
std::string get_name() const
The name of this item in the node.
Definition: CatalogItem.h:133
void set_size(size_t s)
Set the size of the item.
Definition: CatalogItem.h:140
void set_is_data(bool id)
Is this item data that the BES should interpret?
Definition: CatalogItem.h:150
void set_lmt(std::string lmt)
Set the LMT for this item.
Definition: CatalogItem.h:145
item_type get_type() const
Get the type of this item (unknown, node or leaf)
Definition: CatalogItem.h:153
void set_type(item_type t)
Set the type for this item.
Definition: CatalogItem.h:155