core/vul/vul_url.cxx
Go to the documentation of this file.
00001 // This is core/vul/vul_url.cxx
00002 #ifdef VCL_NEEDS_PRAGMA_INTERFACE
00003 #pragma implementation
00004 #endif
00005 //:
00006 // \file
00007 // \author Ian Scott
00008 // Based on vil_stream_url by fsm
00009 // \verbatim
00010 //  Modifications
00011 //   8 Nov 2002 - Peter Vanroose - corrected HTTP client request syntax
00012 // \endverbatim
00013 
00014 #include "vul_url.h"
00015 #include <vcl_cstdio.h>  // sprintf()
00016 #include <vcl_cstring.h>
00017 #include <vcl_cstdlib.h>
00018 #include <vcl_sstream.h>
00019 #include <vcl_cassert.h>
00020 #include <vcl_fstream.h>
00021 #include <vul/vul_file.h>
00022 
00023 #if defined(unix) || defined(__unix) || defined(__unix__)
00024 
00025 # include <unistd.h>       // read(), write(), close()
00026 # include <netdb.h>        // gethostbyname(), sockaddr_in()
00027 # include <sys/socket.h>
00028 # include <netinet/in.h>   // htons()
00029 # ifdef __alpha
00030 #  include <fp.h>          // htons() [ on e.g. DEC alpha, htons is in machine/endian.h ]
00031 # endif
00032 # define SOCKET int
00033 
00034 #elif defined (VCL_WIN32) && !defined(__CYGWIN__)
00035 
00036 # include <winsock2.h>
00037 
00038 #endif // unix
00039 
00040 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00041 // Flag tested by the HTTP helpers below before they call WSAStartup(), so that Winsock is not initialised more often than we need to
00042 static int called_WSAStartup = 0;
00043 #endif
00044 
00045 //: only call this method with a correctly formatted http URL
00046 vcl_istream * vul_http_open(char const *url)
00047 {
00048   // split URL into auth, host, path and port number.
00049   vcl_string host;
00050   vcl_string path;
00051   vcl_string auth;
00052   int port = 80; // default
00053 
00054   // check it is an http URL.
00055   assert (vcl_strncmp(url, "http://", 7) == 0);
00056 
00057   char const *p = url + 7;
00058   while (*p && *p!='/')
00059     ++ p;
00060   host = vcl_string(url+7, p);
00061 
00062 
00063   if (*p)
00064     path = p+1;
00065   else
00066     path = "";
00067 
00068   //authentication
00069   for (unsigned int i=0; i<host.size(); ++i)
00070     if (host[i] == '@') {
00071       auth = vcl_string(host.c_str(), host.c_str()+i);
00072       host = vcl_string(host.c_str()+i+1, host.c_str() + host.size());
00073       break;
00074     }
00075 
00076   // port?
00077   if (host.size() > 0L)
00078   for (unsigned int i=(unsigned int)(host.size()-1); i>0; --i)
00079     if (host[i] == ':') {
00080       port = vcl_atoi(host.c_str() + i + 1);
00081       host = vcl_string(host.c_str(), host.c_str() + i);
00082       break;
00083     }
00084 
00085   // do character translation
00086   unsigned k =0;
00087   while (k < path.size())
00088   {
00089     if (path[k] == ' ')
00090       path.replace(k, 1, "%20");
00091     else if (path[k] == '%')
00092       path.replace(k, 1, "%25");
00093     ++k;
00094   }
00095 
00096   // so far so good.
00097 #ifdef DEBUG
00098   vcl_cerr << "auth = \'" << auth << "\'\n"
00099            << "host = \'" << host << "\'\n"
00100            << "path = \'" << path << "\'\n"
00101            << "port = " << port << vcl_endl;
00102 #endif
00103 
00104 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00105   if (called_WSAStartup==0)
00106   {
00107     WORD wVersionRequested;
00108     WSADATA wsaData;
00109 
00110     wVersionRequested = MAKEWORD( 2, 2 );
00111 
00112     /* int err = */ WSAStartup( wVersionRequested, &wsaData );
00113   }
00114 #endif
00115 
00116   // create socket endpoint.
00117   SOCKET tcp_socket = socket(PF_INET,      // IPv4 protocols.
00118                              SOCK_STREAM,  // two-way, reliable,
00119                                            // connection-based stream socket.
00120                              PF_UNSPEC);   // protocol number.
00121 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00122   if (tcp_socket == INVALID_SOCKET) {
00123 # ifndef NDEBUG
00124     vcl_cerr << __FILE__ "error code : " << WSAGetLastError() << '\n';
00125 # endif
00126 #else
00127   if (tcp_socket < 0) {
00128 #endif
00129     vcl_cerr << __FILE__ ": failed to create socket.\n";
00130     return 0;
00131   }
00132 
00133 #ifdef DEBUG
00134   vcl_cerr << __FILE__ ": tcp_socket = " << tcp_socket << '\n';
00135 #endif
00136 
00137   // get network address of server.
00138   hostent *hp = gethostbyname(host.c_str());
00139   if (! hp) {
00140     vcl_cerr << __FILE__ ": failed to lookup host\n";
00141 
00142 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00143     closesocket(tcp_socket);
00144 #else
00145     close(tcp_socket);
00146 #endif
00147 
00148     return 0;
00149   }
00150 
00151   // make socket address.
00152   sockaddr_in my_addr;
00153   my_addr.sin_family = AF_INET;
00154   // convert port number to network byte order..
00155   my_addr.sin_port = htons(port);
00156   vcl_memcpy(&my_addr.sin_addr, hp->h_addr_list[0], hp->h_length);
00157 
00158   // connect to server.
00159   if (connect(tcp_socket , (sockaddr *) &my_addr, sizeof my_addr) < 0) {
00160     vcl_cerr << __FILE__ ": failed to connect to host\n";
00161     //perror(__FILE__);
00162 
00163 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00164     closesocket(tcp_socket);
00165 #else
00166     close(tcp_socket);
00167 #endif
00168 
00169     return 0;
00170   }
00171 
00172   // buffer for data transfers over socket.
00173   char buffer[4096];
00174 
00175   // send HTTP 1.1 request.
00176   vcl_snprintf(buffer, 4090-vcl_strlen(buffer),
00177                "GET %s HTTP/1.1\r\nUser-Agent: vul_url\r\nHost: %s\r\nAccept: */*\r\n",
00178                url, host.c_str());
00179 
00180   if (auth != "")
00181     vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer),
00182                  "Authorization: Basic %s\r\n",
00183                  vul_url::encode_base64(auth).c_str());
00184 
00185   if (vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer), "\r\n") < 0)
00186   {
00187     vcl_cerr << "ERROR: vul_http_open buffer overflow.";
00188     vcl_abort();
00189   }
00190 
00191 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00192   if (send(tcp_socket, buffer, (int)vcl_strlen(buffer), 0) < 0) {
00193 #else
00194   if (::write(tcp_socket, buffer, vcl_strlen(buffer)) < 0) {
00195 #endif
00196     vcl_cerr << __FILE__ ": error sending HTTP request\n";
00197 
00198 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00199     closesocket(tcp_socket);
00200 #else
00201     close(tcp_socket);
00202 #endif
00203     return 0;
00204   }
00205 
00206 
00207   // read from socket into memory.
00208   vcl_string contents;
00209   {
00210     int n;
00211 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00212     while ((n = recv(tcp_socket, buffer, sizeof buffer,0 )) > 0) {
00213 #else
00214     while ((n = ::read(tcp_socket, buffer, sizeof buffer)) > 0) {
00215 #endif
00216       contents.append(buffer, n);
00217 #ifdef DEBUG
00218       vcl_cerr << n << " bytes\n";
00219 #endif
00220     }
00221   }
00222 
00223   // close connection to server.
00224 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00225   closesocket(tcp_socket);
00226 #else
00227   close(tcp_socket);
00228 #endif
00229 
00230 #ifdef DEBUG
00231   vcl_cerr << "HTTP server returned:\n" << contents << '\n';
00232 #endif
00233 
00234   if (contents.find("HTTP/1.1 200") == contents.npos)
00235   {
00236     return 0;
00237   }
00238   vcl_string::size_type n = contents.find("\r\n\r\n");
00239   if (n == contents.npos)
00240   {
00241     return 0;
00242   }
00243 
00244   contents.erase(0,n+4);
00245 #ifdef DEBUG
00246   vcl_cerr << "vul_url::vul_http_open() returns:\n" << contents << '\n';
00247 #endif
00248   return new vcl_istringstream(contents);
00249 }
00250 
00251 
00252 //: only call this method with a correctly formatted http URL
00253 bool vul_http_exists(char const *url)
00254 {
00255   // split URL into auth, host, path and port number.
00256   vcl_string host;
00257   vcl_string path;
00258   vcl_string auth;
00259   int port = 80; // default
00260   assert (vcl_strncmp(url, "http://", 7) == 0);
00261 
00262   char const *p = url + 7;
00263   while (*p && *p!='/')
00264     ++ p;
00265   host = vcl_string(url+7, p);
00266 
00267 
00268   if (*p)
00269     path = p+1; // may be the empty string, if URL ends in a slash
00270   else
00271     path = "";
00272 
00273   //authentication
00274   for (unsigned int i=0; i<host.size(); ++i)
00275     if (host[i] == '@') {
00276       auth = vcl_string(host.c_str(), host.c_str()+i);
00277       host = vcl_string(host.c_str()+i+1, host.c_str() + host.size());
00278       break;
00279     }
00280 
00281   // port?
00282   for (unsigned int i=0; i<host.size(); ++i)
00283     if (host[i] == ':') {
00284       port = vcl_atoi(host.c_str() + i + 1);
00285       host = vcl_string(host.c_str(), host.c_str() + i);
00286       break;
00287     }
00288 
00289   // do character translation
00290   unsigned k =0;
00291   while (k < path.size())
00292   {
00293     if (path[k] == ' ')
00294       path.replace(k, 1, "%20");
00295     else if (path[k] == '%')
00296       path.replace(k, 1, "%25");
00297     k++;
00298   }
00299 
00300   // so far so good.
00301 #ifdef DEBUG
00302   vcl_cerr << "auth = \'" << auth << "\'\n"
00303            << "host = \'" << host << "\'\n"
00304            << "path = \'" << path << "\'\n"
00305            << "port = " << port << vcl_endl;
00306 #endif
00307 
00308 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00309   if (called_WSAStartup==0)
00310   {
00311     WORD wVersionRequested;
00312     WSADATA wsaData;
00313 
00314     wVersionRequested = MAKEWORD( 2, 2 );
00315 
00316     /* int err = */ WSAStartup( wVersionRequested, &wsaData );
00317   }
00318 #endif
00319 
00320   // create socket endpoint.
00321   SOCKET tcp_socket = socket(PF_INET,      // IPv4 protocols.
00322                              SOCK_STREAM,  // two-way, reliable,
00323                                            // connection-based stream socket.
00324                              PF_UNSPEC);   // protocol number.
00325 
00326 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00327   if (tcp_socket == INVALID_SOCKET) {
00328 # ifndef NDEBUG
00329     vcl_cerr << "error code : " << WSAGetLastError() << vcl_endl;
00330 # endif
00331 #else
00332   if (tcp_socket < 0) {
00333 #endif
00334     vcl_cerr << __FILE__ ": failed to create socket.\n";
00335     return false;
00336   }
00337 
00338 #ifdef DEBUG
00339   vcl_cerr << __FILE__ ": tcp_socket = " << tcp_socket << vcl_endl;
00340 #endif
00341 
00342   // get network address of server.
00343   hostent *hp = gethostbyname(host.c_str());
00344   if (! hp) {
00345     vcl_cerr << __FILE__ ": failed to lookup host\n";
00346     return false;
00347   }
00348 
00349   // make socket address.
00350   sockaddr_in my_addr;
00351   my_addr.sin_family = AF_INET;
00352     // convert port number to network byte order..
00353   my_addr.sin_port = htons(port);
00354   vcl_memcpy(&my_addr.sin_addr, hp->h_addr_list[0], hp->h_length);
00355 
00356   // connect to server.
00357   if (connect(tcp_socket , (sockaddr *) &my_addr, sizeof my_addr) < 0)
00358   {
00359     vcl_cerr << __FILE__ ": failed to connect to host\n";
00360     //perror(__FILE__);
00361 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00362     closesocket(tcp_socket);
00363 #else
00364     close(tcp_socket);
00365 #endif
00366 
00367     return false;
00368   }
00369 
00370   // buffer for data transfers over socket.
00371   char buffer[4096];
00372 
00373   // send HTTP 1.1 request.
00374   vcl_snprintf(buffer, 4090,
00375                "HEAD %s HTTP/1.1\r\nUser-Agent: vul_url\r\nHost: %s\r\nAccept: */*\r\n",
00376                url, host.c_str());
00377   if (auth != "")
00378     vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer),
00379                  "Authorization: Basic %s\r\n",
00380                  vul_url::encode_base64(auth).c_str() );
00381 
00382   if (vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer), "\r\n") < 0)
00383   {
00384     vcl_cerr << "ERROR: vul_http_exists buffer overflow.";
00385     vcl_abort();
00386   }
00387 
00388 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00389   if (send(tcp_socket, buffer, (int)vcl_strlen(buffer), 0) < 0) {
00390 #else
00391   if (::write(tcp_socket, buffer, vcl_strlen(buffer)) < 0) {
00392 #endif
00393     vcl_cerr << __FILE__ ": error sending HTTP request\n";
00394 
00395 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00396     closesocket(tcp_socket);
00397 #else
00398     close(tcp_socket);
00399 #endif
00400     return false;
00401   }
00402 
00403 
00404   // read from socket into memory.
00405   vcl_string contents;
00406   {
00407     int n;
00408 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00409     if ((n = recv(tcp_socket, buffer, sizeof buffer,0 )) > 0) {
00410 #else
00411     if ((n = ::read(tcp_socket, buffer, sizeof buffer)) > 0) {
00412 #endif
00413       contents.append(buffer, n);
00414       //vcl_cerr << n << " bytes\n";
00415     }
00416     else
00417     {
00418 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00419       closesocket(tcp_socket);
00420 #else
00421       close(tcp_socket);
00422 #endif
00423       return false;
00424     }
00425   }
00426 
00427   // close connection to server.
00428 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00429   closesocket(tcp_socket);
00430 #else
00431   close(tcp_socket);
00432 #endif
00433 
00434 #ifdef DEBUG
00435   vcl_cerr << "HTTP server returned:\n" << contents << '\n';
00436 #endif
00437 
00438   return contents.find("HTTP/1.1 200") != contents.npos;
00439 }
00440 
00441 
00442 vcl_istream * vul_url::open(const char * url, vcl_ios_openmode mode)
00443 {
00444   // check for null pointer or empty strings.
00445   if (!url || !*url)
00446     return 0;
00447   unsigned int l = (unsigned int)vcl_strlen(url);
00448 
00449   // check for filenames beginning "file:".
00450   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00451     return new vcl_ifstream(url+7,mode);
00452 
00453   // maybe it's an http URL?
00454   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00455     return vul_http_open(url);
00456 
00457   // maybe it's an ftp URL?
00458   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00459   {
00460     vcl_cerr << __LINE__ << "ERROR:\n vul_read_url(const char * url)\n"
00461       "Doesn't support FTP yet, url=" << url << vcl_endl;
00462     return 0;
00463   }
00464 
00465   // try an ordinary filename
00466   return new vcl_ifstream(url, mode);
00467 }
00468 
00469 
00470 //: Does that URL exist
00471 bool vul_url::exists(const char * url)
00472 {
00473   // check for null pointer or empty strings.
00474   if (!url || !*url)
00475     return false;
00476   unsigned int l = (unsigned int)vcl_strlen(url);
00477 
00478   // check for filenames beginning "file:".
00479   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00480     return vul_file::exists(url+7);
00481 
00482   // maybe it's an http URL?
00483   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00484     return vul_http_exists(url);
00485 
00486   // maybe it's an ftp URL?
00487   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00488   {
00489     vcl_cerr << "ERROR: vul_read_url(const char * url)\n"
00490       "Doesn't support FTP yet, url=" << url << vcl_endl;
00491     return false;
00492   }
00493 
00494   // try an ordinary filename
00495   return vul_file::exists(url);
00496 }
00497 
00498 //: Is that a URL
00499 bool vul_url::is_url(const char * url)
00500 {
00501   // check for null pointer or empty strings.
00502   if (!url || !*url)
00503     return false;
00504   unsigned int l = (unsigned int)vcl_strlen(url);
00505 
00506   // check for filenames beginning "file:".
00507   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00508     return true;
00509 
00510   // maybe it's an http URL?
00511   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00512     return true;
00513 
00514   // maybe it's an ftp URL?
00515   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00516     return true;
00517 
00518   return false;
00519 }
00520 
00521 //=======================================================================
00522 
00523 bool vul_url::is_file(const char * fn)
00524 {
00525   if (vul_url::is_url(fn))
00526     return vul_url::exists(fn);
00527   else
00528     return vul_file::exists(fn) && ! vul_file::is_directory(fn);
00529 }
00530 
00531 //=======================================================================
00532 
// The 64 characters of the base64 alphabet, indexed by 6-bit value.
static const
char base64_encoding[]=
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";

// Result of the most recent encode_triplet() call.  It is shared by all
// calls and is not NUL terminated: exactly four characters are valid.
static char out_buf[4];

// Encode up to three input bytes as four base64 characters.
// data holds the input bytes; data[1] is always read, and data[2] is read
// unless n is 1, so the caller must zero the bytes beyond the first n.
// n is the number of meaningful bytes (1, 2 or 3, checked by assert);
// the output is padded with '=' when n is 1 or 2.
// Returns a pointer to the static out_buf, which the next call overwrites.
static const char * encode_triplet(char data[3], unsigned int n)
{
  assert (n>0 && n <4);

  // work on unsigned byte values so that the shifts below are well defined.
  const unsigned int first  = (unsigned char)data[0];
  const unsigned int second = (unsigned char)data[1];

  // top six bits of the first byte
  out_buf[0] = base64_encoding[first >> 2];
  // low two bits of the first byte, then the top four of the second
  out_buf[1] = base64_encoding[((first & 0x03) << 4) | (second >> 4)];

  if (n != 1)
  {
    const unsigned int third = (unsigned char)data[2];
    // low four bits of the second byte, then the top two of the third
    out_buf[2] = base64_encoding[((second & 0x0f) << 2) | (third >> 6)];
    // low six bits of the third byte, or padding if there is no third byte
    out_buf[3] = (n == 2) ? '=' : base64_encoding[third & 0x3f];
  }
  else
  {
    // a single input byte only fills two characters; pad the rest
    out_buf[2] = '=';
    out_buf[3] = '=';
  }

  return out_buf;
}
00569 
00570 //=======================================================================
00571 
00572 vcl_string vul_url::encode_base64(const vcl_string& in)
00573 {
00574   vcl_string out;
00575   unsigned int i = 0, line_octets = 0;
00576   const unsigned int l = (unsigned int)(in.size());
00577   char data[3];
00578   while (i <= l)
00579   {
00580     if (i == l)
00581     {
00582       out.append("=");
00583       return out;
00584     }
00585 
00586     data[0] = in[i++];
00587     data[1] = data[2] = 0;
00588 
00589     if (i == l)
00590     {
00591       out.append(encode_triplet(data,1),4);
00592       return out;
00593     }
00594 
00595     data[1] = in[i++];
00596 
00597     if (i == l)
00598     {
00599       out.append(encode_triplet(data,2),4);
00600       return out;
00601     }
00602 
00603     data[2] = in[i++];
00604 
00605     out.append(encode_triplet(data,3),4);
00606 
00607     if (line_octets >= 68/4) // print carriage return
00608     {
00609       out.append("\r\n",2);
00610       line_octets = 0;
00611     }
00612     else
00613       ++line_octets;
00614   }
00615 
00616   return out;
00617 }
00618 
00619 //=======================================================================
00620 
00621 static int get_next_char(const vcl_string &in, unsigned int *i)
00622 {
00623   while (*i < in.size())
00624   {
00625     char c;
00626     c = in[(*i)++];
00627 
00628     if (c == '+')
00629       return 62;
00630 
00631     if (c == '/')
00632       return 63;
00633 
00634     if (c >= 'A' && c <= 'Z')
00635       return 0 + (int)c - (int)'A';
00636 
00637     if (c >= 'a' && c <= 'z')
00638       return 26 + (int)c - (int)'a';
00639 
00640     if (c >= '0' && c <= '9')
00641       return 52 + (int)c - (int)'0';
00642 
00643     if (c == '=')
00644       return 64;
00645   }
00646   return -1;
00647 }
00648 
00649 //=======================================================================
00650 
00651 vcl_string vul_url::decode_base64(const vcl_string& in)
00652 {
00653   int c;
00654   char data[3];
00655 
00656   unsigned int i=0;
00657   const unsigned int l = (unsigned int)(in.size());
00658   vcl_string out;
00659   while (i < l)
00660   {
00661     data[0] = data[1] = data[2] = 0;
00662 
00663     // -=- 0 -=-
00664     // Search next valid char...
00665     c = get_next_char(in , &i);
00666 
00667     // treat '=' as end of message
00668     if (c == 64)
00669       return out;
00670     if (c==-1)
00671       return "";
00672 
00673     data[0] = char(((c & 0x3f) << 2) | (0x3 & data[0]));
00674 
00675     // -=- 1 -=-
00676     // Search next valid char...
00677     c = get_next_char(in , &i);
00678 
00679       // Error! Second character in octet can't be '='
00680     if (c == 64 || c==-1)
00681       return "";
00682 
00683     data[0] = char(((c & 0x30) >> 4) | (0xfc & data[0]));
00684     data[1] = char(((c & 0x0f) << 4) | (0x0f & data[1]));
00685 
00686     // -=- 2 -=-
00687     // Search next valid char...
00688 
00689     c = get_next_char(in , &i);
00690 
00691     if (c==-1)
00692       return "";
00693     if (c == 64)
00694     {
00695       // should really read next char and check it is '='
00696       out.append(data,1);  // write 1 byte to output
00697       return out;
00698     }
00699 
00700     data[1] = char(((c & 0x3c) >> 2) | (0xf0 & data[1]));
00701     data[2] = char(((c & 0x03) << 6) | (0x3f & data[2]));
00702 
00703     // -=- 3 -=-
00704     // Search next valid char...
00705     c = get_next_char(in , &i);
00706 
00707     if (c==-1)
00708       return "";
00709 
00710     if (c == 64)
00711     {
00712       out.append(data,2);  // write 2 bytes to output
00713       return out;
00714     }
00715 
00716     data[2] = char((c & 0x3f) | (0xc0 & data[2]));
00717 
00718     out.append(data,3);  // write 3 bytes to output
00719   }
00720 
00721   return out;
00722 }