/* $Id: tcp_bw_exp.c,v 1.110 2005/05/09 20:48:34 graziano Exp $ */

#include "config_nws.h"

#include <sys/types.h>     /* system type definitions */
#include <stdio.h>         /* snprintf */
#include <stdlib.h> 
#include <assert.h>        
#include <string.h>        /* memset */
#include <netinet/in.h>    /* sometimes required for #include <arpa/inet.h> */
#include <sys/socket.h>    /* [gs]etsockopt */
#include <sys/time.h>      /* struct timeval */
#include <unistd.h>        /* select (some systems) */
#include <errno.h>

#include "protocol.h"
#include "diagnostic.h"
#include "osutil.h"
#include "messages.h"
#include "tcp_bw_exp.h"
#include "timeouts.h"
#include "host_protocol.h"
#include "strutil.h"


/* Port number placed in a TcpHandshake to tell the initiator that the
 * connection it already opened will be used for the experiment (no
 * reconnection needed). */
#define REUSE_CONNECTION 0
/* Message type of the handshake the receiving side sends back after a
 * TCP_BW_REQ. */
#define TCP_HANDSHAKE 411
/* Message type a forked child uses to report timeout data to its parent. */
#define TIMEOUT_FEEDBACK (TCP_HANDSHAKE + 1)

/* This is somewhat of a kludge: we need to know whether to fork whenever
 * a TCP_BW_REQ message is received, and that information comes from the
 * options handed to TcpLtBwAvailable().
 *  #doFork#          non-zero when HandleMessage() should service each
 *                    TCP_BW_REQ in a child process.
 *  #myPort#          port passed to TerminateTcpExp() for the experiment
 *                    listening socket; 0 means no specific port is forced.
 *  #bufferMultiply#  multiplying factor for the socket buffer (Linux
 *                    allocates double the size that was asked for); used
 *                    when comparing the requested buffer size against the
 *                    value reported by getsockopt(). */
static int doFork = 1;
static int myPort = 0;
static int bufferMultiply = 1;

/* Payload of a TCP_HANDSHAKE message: tells the initiator where the
 * experiment connection has to be established. */
typedef struct {
	IPAddress ipAddr; /* address to reconnect to; TerminateTcpExp always sends 0, in which case the initiator reuses the address it already contacted */
	unsigned short portNum; /* port to reconnect to, or REUSE_CONNECTION to keep the current connection */
} TcpHandshake;
/* Wire layout of TcpHandshake, one entry per member. */
static const DataDescriptor tcpHandshakeDescriptor[] =
	{SIMPLE_MEMBER(UNSIGNED_INT_TYPE, 1, offsetof(TcpHandshake, ipAddr)),
	SIMPLE_MEMBER(UNSIGNED_SHORT_TYPE, 1, offsetof(TcpHandshake, portNum))};
#define tcpHandshakeDescriptorLength 2

/* Payload of a TIMEOUT_FEEDBACK message: carries the arguments the parent
 * needs in order to call SetTimeOut() on behalf of a child. */
typedef struct {
	short type;             /* holds the timeout type (SEND or RECV) */
	unsigned long address;  /* the target address */
	double duration;        /* how long it took */
	long size;              /* the size of the transmission */
	short timedOut;         /* did the connection time out? */
} timeoutStruct;
/* Wire layout of timeoutStruct, one entry per member. */
static const DataDescriptor timeoutDescriptor[] =
   {SIMPLE_MEMBER(SHORT_TYPE, 1, offsetof(timeoutStruct, type)),
    SIMPLE_MEMBER(UNSIGNED_LONG_TYPE, 1, offsetof(timeoutStruct, address)),
    SIMPLE_MEMBER(DOUBLE_TYPE, 1, offsetof(timeoutStruct, duration)),
    SIMPLE_MEMBER(LONG_TYPE, 1, offsetof(timeoutStruct, size)),
    SIMPLE_MEMBER(SHORT_TYPE, 1, offsetof(timeoutStruct, timedOut))};
#define timeoutDescriptorLength (5)


/*
 * Interacts with the peer connected to #peer# to perform the TCP experiment.
 * #sender# indicates whether this process is the sender (non-zero) or receiver
 * (zero) of the data.
 *
 * The experiment has two phases:
 *  1. latency: a single byte is bounced between the two ends; the round
 *     trip time is stored in results[1] (elapsed MicroTime() / 1000,
 *     i.e. milliseconds assuming MicroTime() counts microseconds);
 *  2. bandwidth: #sizes->expSize# bytes are moved in chunks of
 *     #sizes->msgSize# bytes; the rate in megabits/second is stored in
 *     results[0] (only when the phase succeeds).
 *
 * #childToParent# is a socket (pipe) used to communicate to the parent
 * the data for the automatic timeout; when we did not fork (or the pipe
 * is NO_SOCKET) SetTimeOut() is called directly instead.
 *
 * #peer# is dropped before returning on every path that got past the
 * buffer allocation. Returns 0 on failure, non-zero on success.
 */
static int
DoTcpExp(	Socket *peer,
		const TcpBwCtrl *sizes,
		int sender,
		double *results,
		Socket *childToParent) {
	int bytesThisMessage, bytesThisCall, bytesTotal, result, tmp, failed;
	int selectErr = 0;
	char *expData;
	char *name;
	fd_set peerFDs;
	double timeStart, timeStop, recvTO, sendTO;
	struct timeval timeOut;
	IPAddress addr;
	timeoutStruct t;

	/* The sizes may come straight from the network: a message size of
	 * zero would make the bandwidth loop below spin forever without
	 * ever transferring a byte. */
	if (sizes->msgSize <= 0) {
		WARN1("DoTcpExp: invalid message size %d\n", sizes->msgSize);
		return 0;
	}

	/* Allocate enough data to cover the bandwidth experiment. */
	if ((expData = (char *)malloc(sizes->msgSize)) == NULL) {
		FAIL1("DoTcpExp: malloc %d failed\n", sizes->msgSize);
	}
	/* make memcheckers happy */
	memset(expData, 0, sizes->msgSize);

	/* let's get information on the caller (needed later on) */
	addr = Peer(*peer);

	/* this is for the automatic timeout discovery */
	t.type =  sender ?  SEND: RECV;

	/* Latency experiment ... */
	result = 0;

	/* to save cycles we compute the timeout values once (and we do it
	 * optimistically: it should be all right because we should be in
	 * our own process doing this work) */
	sendTO = GetTimeOut(SEND, addr, sizes->expSize);
	recvTO = GetTimeOut(RECV, addr, sizes->expSize);
	DDEBUG2("DoTcpExp: using %.0fs/%.0fs timeout for send/recv\n", sendTO, recvTO);
#ifdef USE_ALARM_SIGNAL
  	SetRealTimer((int)(sender ? sendTO : recvTO));
#endif
	timeStart = MicroTime();
	if (sender) {
#ifdef USE_NONBLOCKING
		/* with NONBLOCKING sockets there are way too many things
		 * to take care of here, so Send/RecvBytes are called
		 * directly. Only 1 byte is exchanged, hence the result
		 * is simply 1 or 0. */
		if (SendBytes(*peer, expData, 1, sendTO)) {
			result = RecvBytes(*peer, expData, 1, recvTO);
		}
#else
		/* write/read are used instead of send/recv: they behave
		 * the same when no flags are given, and they work with
		 * pipes too */
		result = write(*peer, expData, 1);
		if (result > 0) {
			result = read(*peer, expData, 1);
		}
#endif
	} else {
#ifdef USE_NONBLOCKING
		/* Only 1 byte is exchanged, hence the result is simply
		 * 1 or 0. */
		if (RecvBytes(*peer, expData, 1, recvTO)) {
			result = SendBytes(*peer, expData, 1, sendTO);
		}
#else
		/* read/write are used instead of recv/send: they behave
		 * the same when no flags are given, and they work with
		 * pipes too */
		result = read(*peer, expData, 1);
		if (result > 0) {
			result = write(*peer, expData, 1);
		}
#endif
	}
	timeStop = MicroTime();

	/* we want to set the automatic timeout discovery: if we forked
	 * we need to talk to the parent, otherwise we need to call the
	 * function directly. Here we send it only if we failed (to let
	 * the timeout grow). If all goes well we send only the one
	 * with the bw (more bytes sent, perhaps a better number?). */
	t.timedOut = (result <= 0);
	if (t.timedOut) {
#ifdef USE_ALARM_SIGNAL
		RESETREALTIMER;
#endif
		free(expData);

		/* let's give a feedback for the timeout */
		t.duration = ((timeStop - timeStart)/1000000.0);
		t.size = 0;
		if (doFork == 1 && *childToParent != NO_SOCKET) {
			/* send a message */
			t.address = addr.addr;
			if (!SendMessageAndData(*childToParent, 
					TIMEOUT_FEEDBACK,
					&t,
					timeoutDescriptor,
					timeoutDescriptorLength,
					-1)) {
				LOG("DoTcpExp: failed to send TIMEOUT_FEEDBACK\n");
			}
		} else {
			/* let's call the function directly */
			SetTimeOut(t.type, addr, t.duration, t.size, t.timedOut);
		}

		/* the experiment is over for this connection: release it
		 * just as the bandwidth path below does, so callers do
		 * not leak it on failure */
		DROP_SOCKET(peer);

		name = IPAddressImage_r(addr);
		WARN1("DoTcpExp: latency with %s failed\n", (name != NULL) ? name : "unknown");
		free(name);
		return 0;
	}

	/* store the result */
	results[1] = (timeStop - timeStart)/1000.0;

	/* ... then bandwidth. */
	bytesTotal = 0;
	bytesThisCall = 0;

	/* let's start the timer */
	timeStart = MicroTime();
	for (failed = 0; bytesTotal < sizes->expSize && !failed; bytesTotal += bytesThisMessage) {
		for(bytesThisMessage = 0; bytesThisMessage < sizes->msgSize && !failed; bytesThisMessage += bytesThisCall) {
			/* get the # of bytes to be read/sent */
			tmp = sizes->msgSize - bytesThisMessage;

			if (sender) {
#ifdef USE_NONBLOCKING
				/* Send/RecvBytes deal with the
				 * NONBLOCKING details for us */
				bytesThisCall = SendBytes(*peer, expData, tmp, sendTO);
				if (bytesThisCall != tmp) {
					/* something happened: let's get
					 * out of here */
					failed = 1;
				}
#else
				/* write instead of send: same
				 * behaviour without flags, and it
				 * works with pipes too */
				bytesThisCall = write(*peer, expData, tmp);
#endif
			} else {
#ifdef USE_NONBLOCKING
				/* Send/RecvBytes deal with the
				 * NONBLOCKING details for us */
				bytesThisCall = RecvBytes(*peer, expData, tmp, recvTO);
				if (bytesThisCall != tmp) {
					/* something happened: let's get
					 * out of here */
					failed = 1;
				}
#else
				/* read instead of recv: same
				 * behaviour without flags, and it
				 * works with pipes too */
				bytesThisCall = read(*peer, expData, tmp);
#endif
			}
			/* let's check if we failed somewhere */
			if (bytesThisCall <= 0) {
				/* let's get out of here */
				failed = 1;
				break;
			}
		}
	}

	/* if we didn't fail let's finish it up */
	if (!failed && sender) {
		/* Wait for the receiver to close the connection so that
		 * we're sure that all the data has been sent and
		 * received, rather than locally buffered.  */
		FD_ZERO(&peerFDs);
		FD_SET(*peer, &peerFDs);
		timeOut.tv_usec = 0;
		timeOut.tv_sec = (time_t)sendTO;

		/* the error code goes into its own variable: bytesTotal
		 * is still needed below for the timeout feedback */
		result = PortabilitySelect(*peer + 1, &peerFDs, NULL, NULL, &timeOut, &selectErr);
		if (result < 0) {
			LOG1("DoTcpExp: select got problems %d\n", selectErr);
		} else if (result == 0) {
			/* select may have modified timeOut, so report the
			 * value we asked for */
			LOG1("DoTcpExp: select timed out (%ds)\n", (int)sendTO);
		}
	}
	timeStop = MicroTime();
#ifdef USE_ALARM_SIGNAL
	RESETREALTIMER;
#endif

	/* now let's drop the socket, so the remote end can time
	 * correctly */
	DROP_SOCKET(peer);

	/* free memory */
	free(expData);

	/* let's check if we failed somewhere */
	if (failed) {
		name = IPAddressImage_r(addr);
		WARN2("DoTcpExp: failed bandwidth after %d bytes with %s\n", bytesTotal, (name != NULL) ? name : "unknown");
		FREE(name);
	} else {
		/* we are measuring Mbit so * 8 / 1000000 */
		results[0] = ( ((double)sizes->expSize * 8.0) / ((double)(timeStop - timeStart) / 1000000.0) ) / 1000000.0;
	}

	/* we want to set the automatic timeout discovery: if we forked
	 * we need to talk to the parent, otherwise we need to call the
	 * function directly */
	t.duration = (timeStop - timeStart)/1000000;
	t.timedOut = failed;
	t.size = bytesTotal;
	if (doFork == 1 && *childToParent != NO_SOCKET) {
		/* send the update for the automatic timeout */
		t.address = addr.addr;
		if (!SendMessageAndData(*childToParent, 
					TIMEOUT_FEEDBACK,
					&t,
					timeoutDescriptor,
					timeoutDescriptorLength,
					-1)) {
			LOG("DoTcpExp: failed to send TIMEOUT_FEEDBACK\n");
		}
	} else {
		/* let's call the function directly */
		SetTimeOut(t.type, addr, t.duration, t.size, t.timedOut);
	}

	return (!failed);
}


/*
 * Contacts the server listening to #machine#:#port#, sends
 * #requestMessage# together with #sizes#, waits for the TCP_HANDSHAKE
 * reply and then conducts the TCP latency and bandwidth experiments as
 * the sending side. If the handshake carries a port other than
 * REUSE_CONNECTION a new connection is opened to that port (and to the
 * handshake address when it is non-zero) and used for the experiment.
 * #timeOut# bounds the request send, the handshake receive and the
 * reconnection. #childToParent# is a socket (pipe) to the parent used to
 * communicate the data needed to set the automatic timeout.
 * If successful, returns non-zero and sets results[0] and results[1] to
 * the observed bandwidth (megabits/second) and latency (milliseconds);
 * otherwise, returns 0. All sockets opened here are dropped before
 * returning.
 */
int
InitiateTcpExp(IPAddress machine,
               unsigned short port,
               MessageType requestMessage,
               const TcpBwCtrl *sizes,
               double timeOut,
               double *results,
	       Socket *childToParent) {

	TcpHandshake handshake;
	Socket serverSock, extra;
	int returnValue;
	char *tmp;

	/* get a printable name for the target: host name first, dotted
	 * address as a fallback */
	tmp = IPAddressMachine_r(machine);
	if (tmp == NULL) {
		WARN("InitiateTcpExp: cannot resolve to name: trying IP address!\n");
		tmp = IPAddressImage_r(machine);
		if (tmp == NULL) {
			WARN("InitiateTcpExp: cannot resolve address to IP!\n");
			tmp = strdup("unknown");
			if (tmp == NULL) {
				ABORT("InitiateTcpExp: out of memory\n");
			}
		}
	}
	LOG5("TCP(%d, %d, %d) to %s:%d\n", sizes->expSize, sizes->bufferSize, sizes->msgSize, tmp, port);

	if (CallAddrBuff(machine, port, sizes->bufferSize, &serverSock, GetTimeOut(CONN, machine, 0))) {
		/* just to be sure we are not using it somewhere else */
		SocketInUse(serverSock);
	} else {
		LOG1("InitiateTcpExp: failed to connect to %s\n", tmp);
		FREE(tmp);
		return 0; 
	}
	FREE(tmp);

	if(!SendMessageAndData(serverSock,
                         requestMessage,
                         sizes,
                         tcpBwCtrlDescriptor,
                         tcpBwCtrlDescriptorLength,
                         timeOut)) {
		/* log before dropping, so the message shows the socket
		 * that actually failed */
		WARN1("InitiateTcpExp: request send failed on %d\n", serverSock);
		DROP_SOCKET(&serverSock);
		return 0;
	}

	/*
	 * Note: the server may have to set up a new listening port
	 * before it can respond, so should we use a generous time-out
	 * value here?
	 */
	if(!RecvMessageAndData(serverSock,
                         TCP_HANDSHAKE,
                         &handshake,
                         tcpHandshakeDescriptor,
                         tcpHandshakeDescriptorLength,
                         timeOut)) {
		WARN1("InitiateTcpExp: receive of handshake failed on %d\n", serverSock);
		DROP_SOCKET(&serverSock);
		return 0;
	}

	extra = NO_SOCKET;
	returnValue = 0;
	if(handshake.portNum != REUSE_CONNECTION) {
		/* Reestablish the connection on the port contained in
		 * the handshake. */
		extra = serverSock;		/* save the original socket */
		serverSock = NO_SOCKET;

		if(CallAddrBuff((!handshake.ipAddr.addr) ? machine : handshake.ipAddr,
				  handshake.portNum, 
				  sizes->bufferSize, 
				  &serverSock, 
				  timeOut)) {
			/* socket is in use */
			SocketInUse(serverSock);
		} else {
			/* drop old socket */
			DROP_SOCKET(&extra);
			return 0;
		}
	}

	returnValue = DoTcpExp(&serverSock, sizes, 1, results, childToParent);

	/* DoTcpExp drops the experiment socket once the bandwidth phase
	 * is over, but it can bail out earlier: make sure the connection
	 * is released in that case too */
	DROP_SOCKET(&serverSock);
	DROP_SOCKET(&extra);

	return returnValue;

}



/*
 * Handles a request for a TCP experiment received on #sender#: reads the
 * experiment sizes from it, replies with a TCP_HANDSHAKE and then acts as
 * the receiving side of the experiment. When the receive buffer of
 * #sender# already matches the requested size the same connection is
 * reused; otherwise a listening socket with the requested buffer size is
 * opened (on #portToUse# when it is positive) and the client is given 10
 * seconds to connect to it. #header# is the header of the message that
 * triggered this (currently unused). #childToParent# is a socket (pipe)
 * that we use to communicate with the parent to set the automatic
 * timeouts. Returns non-zero if the experiment was successful, 0
 * otherwise. #sender# is dropped once the experiment has been attempted.
 */

int
TerminateTcpExp(	MessageHeader *header,
			Socket *sender,
			int portToUse,
			Socket *childToParent) {
	double ignored[2];
	unsigned int bufferSize;
	int returnValue, tmp;
	int errCode = 0;
	struct timeval timeOut;
	struct sockaddr_in client;
	char *c;
	SOCKLEN_T bufferSizeSize = sizeof(bufferSize);
	SOCKLEN_T clientSize = sizeof(client);
	Socket clientSock;
	Socket earSock;
	fd_set earFD;
	TcpHandshake handshake;
	TcpBwCtrl sizes;

	if(!RecvData(*sender,
       			&sizes,
			tcpBwCtrlDescriptor,
			tcpBwCtrlDescriptorLength,
			-1)) {
		WARN1("TerminateTcpExp: receive failed on %d\n", *sender);
		return 0;
	}

	/* we get a dotted notation here because we don't want to go to
	 * DNS and get stuck there */
	c = PeerName_r(*sender);
	if (c != NULL) {
		LOG4("Servicing TCP(%d, %d, %d) from %s\n", sizes.expSize, sizes.bufferSize, sizes.msgSize, c);
		FREE(c);
	} else {
		LOG3("Servicing TCP(%d, %d, %d)\n", sizes.expSize, sizes.bufferSize, sizes.msgSize);
	}

	/* Advertising our own address (via getsockname) causes problems
	 * if we're behind a firewall, so we always send 0 and let the
	 * client reuse the address it already contacted. */
	handshake.ipAddr.addr = 0;

	if (getsockopt(*sender, SOL_SOCKET, SO_RCVBUF, (char *)&bufferSize, &bufferSizeSize) < 0) {
		WARN1("TerminateTcpExp: getsockopt failed %d\n", errno);
		bufferSize = 0;
	}

	/* failure is the default outcome: only DoTcpExp can change it */
	returnValue = 0;

	if ((bufferMultiply * sizes.bufferSize) == bufferSize) {
		/* We can reuse the connection for the experiment. */
		handshake.portNum = REUSE_CONNECTION;
		if(!SendMessageAndData(*sender,
                           TCP_HANDSHAKE,
                           &handshake,
                           tcpHandshakeDescriptor,
                           tcpHandshakeDescriptorLength,
                           -1)) {
			WARN1("TerminateTcpExp: send of reply failed on %d\n", *sender);
			return 0;
		}
		returnValue = DoTcpExp(sender, &sizes, 0, ignored, childToParent);
	} else {
		/* Establish a listening port with the experiment buffer
		 * size. */
		/* let's see if we need to use a specific port */
		if (portToUse > 0) {
			tmp = portToUse;
		} else {
			tmp = 0;
		}
		if (!EstablishAnEarBuff(tmp, tmp, &earSock, &handshake.portNum, sizes.bufferSize)) {
			FAIL("TerminateTcpExp: failed to open ear.\n");
		}
		/* don't use this socket for anything else */
		SocketInUse(earSock);

		/* Tell the client what port to contact, then wait for
		 * them to respond. */
		if(!SendMessageAndData(*sender,
                           TCP_HANDSHAKE,
                           &handshake,
                           tcpHandshakeDescriptor,
                           tcpHandshakeDescriptorLength,
                           -1)) {
			DROP_SOCKET(&earSock);
			WARN1("TerminateTcpExp: send of reply failed on %d\n", *sender);
			return 0;
		}

		/* Give the client 10 seconds to connect. */
		FD_ZERO(&earFD);
		FD_SET(earSock, &earFD);
		timeOut.tv_sec = 10;
		timeOut.tv_usec = 0;

		/* error codes are kept apart from returnValue, otherwise a
		 * failed select/accept would be reported as a success */
		if (PortabilitySelect(earSock + 1, &earFD, NULL, NULL, &timeOut, &errCode) <= 0) {
			WARN1("TerminateTcpExp: select failed %d\n", errCode);
		} else if ((clientSock = accept(earSock, (struct sockaddr *)&client, &clientSize)) < 0) {
			errCode = errno;
			WARN1("TerminateTcpExp: accept failed %d\n", errCode);
		} else {
			returnValue = DoTcpExp(&clientSock, &sizes, 0, ignored, childToParent);
			/* DoTcpExp may bail out before dropping the
			 * socket: don't leak the accepted connection */
			DROP_SOCKET(&clientSock);
		}
		DROP_SOCKET(&earSock);
	}
	/* done here */
	DROP_SOCKET(sender);

	return returnValue;
}

/*
 * Listener for the messages registered by this module.
 *  TCP_BW_REQ:       runs TerminateTcpExp() on #sd#; when forking is
 *                    enabled the work is done in a child process which
 *                    exits with status 0 on success, while the parent
 *                    only hands the socket over to the child.
 *  TIMEOUT_FEEDBACK: reads the data a child collected during its
 *                    experiment and feeds it to SetTimeOut().
 * Any other message is ignored.
 */
static void
HandleMessage(	Socket *sd,
		MessageHeader header) {
	pid_t childPid = 0;
	int outcome = 0;
	Socket feedbackPipe = NO_SOCKET;
	timeoutStruct feedback;
	IPAddress target;

	if (header.message == TCP_BW_REQ) {
		if (doFork) {
			if(!CreateLocalChild(&childPid, NULL, &feedbackPipe)) {
				ERROR("HandleMessage: fork failed.\n");
				return;
			}
			if(childPid > 0) {
				/* we are the parent: the child owns the
				 * request from now on */
				PassSocket(sd, childPid);
				return;
			}
			/* from here on we are the child */
		}

		/* TerminateTcpExp is expected to drop the socket when
		 * done and to report whatever went wrong */
		outcome = TerminateTcpExp(&header, sd, myPort, &feedbackPipe);

		if (doFork) {
			/* the child has nothing left to do */
			DROP_SOCKET(&feedbackPipe);
			exit(!outcome);
		}
	} else if (header.message == TIMEOUT_FEEDBACK) {
		/* The children run the network experiments, so they are
		 * the ones holding the data needed to tune the automatic
		 * timeouts; but they exit right after the experiment, so
		 * the tuning has to happen in the parent. This message is
		 * how a child hands the SetTimeOut arguments back to us. */
		if (RecvData(*sd, &feedback, timeoutDescriptor, timeoutDescriptorLength, -1)) {
			/* set the right timeout */
			target.addr = feedback.address;
			SetTimeOut(feedback.type, target, feedback.duration, feedback.size, feedback.timedOut);
		} else {
			DROP_SOCKET(sd);
			ERROR("HandleRequests data receive failed\n");
		}
	}
}


/*
 * Initialization entry point of the skill; always returns 1. The first
 * call parses #options#, probes the socket buffer behaviour and
 * registers the message listeners; later calls do nothing.
 * Recognized options:
 *  fork       forking is enabled unless a value is given that is not a
 *             prefix of (or does not start with) "yes";
 *  forceport  port to use for the experiments; negative values (and the
 *             default) mean no forced port.
 */
int
TcpLtBwAvailable(const char *options) {
	static int initialized = 0;
	Socket probe;
	char *value;
	size_t cmpLen;

	if (!initialized) {
		/* have we been asked to fork or not? */
		value = GetOptionValue(options, "fork", NULL);
		doFork = 1;
		if (value != NULL) {
			/* compare at most the 3 characters of "yes" */
			cmpLen = strlen(value);
			if (cmpLen > 3) {
				cmpLen = 3;
			}
			if (strncmp(value, "yes", cmpLen) != 0) {
				doFork = 0;
			}
		}
		FREE(value);

		/* port to use for the experiments */
		value = GetOptionValue(options, "forceport", "-1");
		myPort = strtol(value, NULL, 10);
		if (myPort < 0) {
			myPort = 0;
		}
		FREE(value);

		/* find out whether the OS allocates a multiple of the
		 * socket buffer size that was asked for */
		probe = socket(AF_INET, SOCK_STREAM, 0);
		if (probe >= 0) {
			bufferMultiply = ConditionSocket(probe, 1, 64*1024);
			if (bufferMultiply == 0) {
				bufferMultiply = 1;
			}
			DROP_SOCKET(&probe);
		}

		/* both messages are served by the same handler */
		RegisterListener(TCP_BW_REQ, "TCP_BW_REQ", &HandleMessage);
		RegisterListener(TIMEOUT_FEEDBACK, "TIMEOUT_FEEDBACK", &HandleMessage);
		initialized = 1;
	}

	return 1;
}

#define ONE_K (1024)
/*
 * Runs the TCP latency/bandwidth experiment against every host listed in
 * the comma separated "target" option and appends two results per host
 * (bandwidthTcp and latencyTcp) to #results#, updating #length#.
 * Recognized options (sizes are expressed in kilobytes):
 *  timeout        timeout handed to InitiateTcpExp (default -1);
 *  size           amount of data moved by the experiment (default 256);
 *  buffer         socket buffer size (default 32);
 *  message        size of each message (default 16);
 *  childToParent  socket used to feed timeout data back to the parent
 *                 (default -1);
 *  target         comma separated list of hosts.
 * A host that cannot be resolved, or whose experiment fails, still gets
 * its two results appended, flagged as failed.
 */
void
TcpLtBwUseSkill(	const char *options, 
			int *length, 
			SkillResult **results) {
	TcpBwCtrl sizes;
	const char *c;
	char opts[255 + 1],
		name[MAX_MACHINE_NAME + 1],
		*tmp;
	double res[2], timeout;
	int ret;
	struct host_cookie host;
	IPAddress address;
	Socket childToParent;

	/* get the default options */
	tmp = GetOptionValue(options, "timeout", "-1"); 
	timeout = strtod(tmp, NULL);
	FREE(tmp);
	tmp = GetOptionValue(options, "size", "256");
	sizes.expSize = strtol(tmp, NULL, 10) * ONE_K;
	FREE(tmp);
	tmp = GetOptionValue(options, "buffer", "32");
	sizes.bufferSize = strtol(tmp, NULL, 10) * ONE_K;
	FREE(tmp);
	tmp = GetOptionValue(options, "message", "16");
	sizes.msgSize = strtol(tmp, NULL, 10) * ONE_K;
	FREE(tmp);

	/* get the socket for the child to parent communication */
	tmp = GetOptionValue(options, "childToParent", "-1");
	childToParent = strtol(tmp, NULL, 10);
	FREE(tmp);

	/* Conduct a message experiment with each host in the
	 * target option. */
	tmp = GetOptionValue(options, "target", "");
	for(c = tmp; GETTOK(name, c, ",", &c);) {
		/* a failed experiment may leave the measurements
		 * untouched: never hand stale or uninitialized values
		 * to AppendResult */
		res[0] = 0.0;
		res[1] = 0.0;

		Host2Cookie(name, DefaultHostPort(SENSOR_HOST), &host);

		/* the option string labels the results below even when
		 * the host cannot be resolved, so it is built first;
		 * snprintf keeps a long target name from overflowing
		 * the buffer */
		snprintf(opts, sizeof(opts), "buffer:%d\tmessage:%d\tsize:%d\ttarget:%s", sizes.bufferSize / ONE_K, sizes.msgSize / ONE_K, sizes.expSize / ONE_K, HostCImage(&host));

		if (!IPAddressValue(host.name, &address)) {
			WARN("TcpLtBwUseSkill: couldn't resolve hostname\n");
			ret = 0;
		} else {
			ret = InitiateTcpExp(address, host.port, TCP_BW_REQ, &sizes, timeout, &res[0], &childToParent);
		}
		AppendResult(bandwidthTcp, opts, ret, res[0], 0, length, results);
		AppendResult(latencyTcp, opts, ret, res[1], 0, length, results);
	}
	FREE(tmp);
}

