Hello guys! I'm trying to learn the Boost regex library for a more-convenient means of error-checking and the asio library to allow a fairly portable-means of manipulating data over TCP/IP.
My goal is to retrieve the file information from a website (basically http layer).
My first problem is determining the Regex for an IP. I've tried the following Regex's--
"^w{3}\\.{1}[.]+\\.{1}[.]+"
"^w{3}" // got desperate here, and it didn't work!
-- the first one I tried to match against 3 w's at the start of the line, a period, and then other characters, then another period and finally whatever other characters to follow. I'm aware that some websites don't require the www prefix, I just wanted to do something interesting for checking to experiment around. Apparently, I misunderstood the Regex syntax I've looked up on Wikipedia and Boost/regex. Any help with this would be much appreciated.
My second problem is the understanding of how to make my client-program retrieve a server-file from the internet much like a web-browser does. I can't stress enough that I'm mainly experimenting with this to see if it can be done, so hopefully someone that is network savvy and/or someone with experience with Boost asio can help me with the next part. Here is my code (and please bare with me I know it's ugly right now, but I have a format for debugging each function that is really helpful to me)--
#ifndef INTERNET_DATA_PARSER_H
#define INTERNET_DATA_PARSER_H
#include <string>
#include <map>
#include <ostream>
#include <iostream>
using std::string;
using std::map;
using std::cout;
using std::ostream;
/**
* The purpose of this class is to do the following--
*
* - Open a link to a specified URL and apply a pattern-match to retrieve all valid matches
* - of the pattern, then send the data to some output (cout, a stringstream, a file, etc).
*/
namespace Parser{
typedef class InternetDataParser{
public:
InternetDataParser(const string& = "style=([^>.]+)>", const string& = "www.google.com", ostream& = std::cout);
string setInfo(const string&, const string&, ostream&);
void setPattern(const string&);
string setURL(const string&);
void setOutputStream(ostream&);
string write();
string getPattern();
string getURL();
ostream& getOstream();
static string getReason(int);
private:
string pattern;
string url;
ostream* out;
static map<int, string> reasons;
static bool initNotCalled;
static void init();
} IDP;
};
#endif
#include "InternetDataParser.h"
#include <string>
#include <map>
#include <ostream>
#include <iostream>
#include <boost/regex.hpp>
#include <boost/array.hpp>
#include <boost/asio.hpp>
using std::string;
using std::map;
using std::pair;
using std::copy;
using std::stringstream;
using Parser::IDP;
using boost::asio::ip::tcp;
bool IDP::initNotCalled = true;
map<int, string> IDP::reasons = map<int, string>();
/*
* Attempts to assign all necessary information at once upon construction.
*
* This function also calls IDP::init() and initializes the static members
*/
IDP::InternetDataParser(const string& p, const string& u, ostream& o) : pattern(p), url(u), out(&o) {
IDP::init();
}
/*
* Attempts to set the Pattern, URL and output stream. Only fails if an invalid URL is passed.
*
* -Input: The Pattern, URL and output stream
* -Returns a reason (reason for failure, or "Good" if the URL is a good URL)
*/
string IDP::setInfo(const string& p, const string& u, ostream& o){
if(strcmp(setURL(u).c_str(), reasons[6].c_str()) == 0)
return reasons[6];
else{
setPattern(p);
setOutputStream(o);
}
return reasons[0];
}
/*
* Sets the member variable pattern with the specified pattern
*/
void IDP::setPattern(const string& p){
pattern = p;
}
/*
* Attempts to set the URL to the value specified.
*
* -Returns the reason for failure, or "Good" if it did not fail
*/
string IDP::setURL(const string& u){
string lower_u;
for(size_t i = 0; i < u.length(); i++)
lower_u += tolower(u[i]);
// Problem here also
static const boost::regex url_format ("w{3}\\.{1}[.]+\\.{1}[.]+");
(*out) << lower_u << std::endl;
if( !boost::regex_match( lower_u, url_format ) ){
(*out) << reasons[6] << std::endl;
return reasons[6];
}else url = lower_u;
return reasons[0];
}
/*
* Sets the ostream& out member with the ostream provided
*/
void IDP::setOutputStream(ostream& o){
out = &o;
}
/*
* Retrieves data from the URL and writes it to the output stream
*
* -Returns the reason for failure, or "Good" if nothing failed.
*/
string IDP::write(){
// MOST OF THIS LOGIC WAS BORROWED DIRECTLY FROM Boost asio!!!!
// Please help me understand the network/application layer of TCP to http
// for retrieving file data on the internet!
try{
boost::asio::io_service io_service;
tcp::resolver resolver(io_service);
tcp::resolver::query query( url.c_str() , "http");
tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
tcp::resolver::iterator end;
tcp::socket socket(io_service);
boost::system::error_code error = boost::asio::error::host_not_found;
while (error && endpoint_iterator != end){
socket.close();
socket.connect(*endpoint_iterator++, error);
}
if (error){
return reasons[7];
}
for (;;){
boost::array<char, 128> buf;
boost::system::error_code error;
(*out) << socket.available();
size_t len = socket.receive(boost::asio::buffer(buf));
(*out).write(buf.data(), len);
// When the server closes the connection, the ip::tcp::socket::read_some() function will exit with the
// boost::asio::error::eof error, which is how we know to exit the loop.
if (error == boost::asio::error::eof){
(*out) << reasons[8];
break; // Connection closed cleanly by peer.
}else if (error)
throw boost::system::system_error(error); // Some other error.
}
}catch (std::exception& e){
std::cerr << e.what() << std::endl;
}
return reasons[0];
}
/*
* -Returns the regex pattern assigned to 'pattern' member variable
*/
string IDP::getPattern(){
return pattern;
}
/*
* Simply returns the URL as a string
*
* -Returns the URL as a string
*/
string IDP::getURL(){
return url;
}
/*
* Simply returns the output stream currently used by this IDP
*
* -Returns the ostream& used by this object.
*/
ostream& IDP::getOstream(){
return *out;
}
/*
* A reason is nothing more than a stored string in the reasons map.
* The reason represents what happened after a function call.
*
* -Input: The reason code.
* -Returns the string representation of the reason code.
*/
string IDP::getReason(int r){
IDP::init();
return ( (reasons.find(r) != reasons.end() ) ? reasons[r] : "Reason not found." );
}
/*
* Initializes the static map 'reasons'
*/
void IDP::init(){
if(initNotCalled){
initNotCalled = false;
reasons.insert( pair<int, string>(0, "Good") );
reasons.insert( pair<int, string>(1, "No matches found") );
reasons.insert( pair<int, string>(2, "Unable to open URL") );
reasons.insert( pair<int, string>(3, "Unable to open File") );
reasons.insert( pair<int, string>(4, "Unable to write to File") );
reasons.insert( pair<int, string>(6, "Invalid URL") );
reasons.insert( pair<int, string>(7, "No Host Found") );
reasons.insert( pair<int, string>(8, "Connection closed.") );
}
}
#include <iostream>
#include "InternetDataParser.h"
using namespace std;
using namespace Parser;
int main(){
IDP idp;
idp.setURL("www.yahoo.com");// fails even though it should match regex?
idp.write();
cin.get();
return 0;
}
-- After running the application, I see one number (the number of bytes that can be read before a block) which is 0, and then "End of File". I don't see any repeats of numbers or and characters being printed to the console screen which means after the function call of receive, it immediately caught an error and the IDP::write function returns. What I'd like to know is how do I not use TCP to wait for a specific server response other than the data of the file itself? How do I stream the bytes of the file into the buffer?
Any help fixing my code would be much appreciated. Thanks!
EDIT:
I found this link from boost from the advanced examples section--
http://www.boost.org/doc/libs/1_44_0/doc/html/boost_asio/example/http/client/async_client.cpp
-- now I just need to figure out how to apply regex and I'm good to go!