/*
    BFilter - a smart ad-filtering web proxy
    Copyright (C) 2002-2005  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include "pch.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "HeuristicScore.h"
#include "URI.h"
#include "ArraySize.h"
#include "StringUtils.h"
#include "InsensitiveEqual.h"
#include "BString.h"
#include "SplittableBuffer.h"
#include "SBOutStream.h"
#include "GlobalState.h"
#include "UrlPatterns.h"
#include <stddef.h>
#include <string>
#include <cctype>
#include <cstdlib>
#include <algorithm>

using namespace std;

// One row of the table of well-known image dimensions (m_sCommonSizes).
// The table is brace-initialized positionally, so the order of the fields
// below must not change.
struct HeuristicScore::SizeRecord
{
	int width;   // primary sort key of the table (presumably pixels)
	int height;  // secondary sort key of the table (presumably pixels)
	bool is_ad;  // what isCommonAdSize() reports for a matching size
	int score;   // what getSizeScore() returns for a matching size
};

// "Less than" ordering for SizeRecord with a tolerance: two widths (and
// then two heights) that differ by at most MAX_DEVIATION are treated as
// equal. Only width and height are examined; is_ad and score are ignored.
//
// Note: because of the tolerance, equivalence under this ordering is not
// transitive in general. It is well behaved on m_sCommonSizes because the
// distinct widths there (and the heights sharing a width) are more than
// 2 * MAX_DEVIATION apart.
struct HeuristicScore::SizeRecordComparator
{
	enum { MAX_DEVIATION = 2 };
	
	SizeRecordComparator() {}
	
	// Returns true if lhs orders strictly before rhs.
	// The comparator is stateless, so the call operator is const-qualified;
	// this lets it be invoked on const instances as well.
	bool operator()(SizeRecord const& lhs, SizeRecord const& rhs) const {
		if (std::abs(lhs.width - rhs.width) > MAX_DEVIATION) {
			// widths are clearly different: they decide the order
			return lhs.width < rhs.width;
		} else if (std::abs(lhs.height - rhs.height) > MAX_DEVIATION) {
			// widths are "equal": fall back to the heights
			return lhs.height < rhs.height;
		}
		// both dimensions are within the tolerance: equivalent
		return false;
	}
};


// Table of common image sizes. Each row is { width, height, is_ad, score }
// (see SizeRecord). findSizeRecord() runs std::lower_bound over this table,
// so rows must stay sorted by width, then by height.
HeuristicScore::SizeRecord const HeuristicScore::m_sCommonSizes[] = {
	// sorted by width, height
	{ 88,  31,  true, 80  },
	{ 100, 100, false, 30 },
	{ 120, 60,  true, 100 },
	{ 120, 90,  false, 30  },
	{ 120, 120, false, 30 },
	{ 120, 240, true, 120 },
	{ 120, 300, true, 120 },
	{ 120, 600, true, 120 },
	{ 125, 125, false, 30 },
	{ 160, 600, true, 120 },
	{ 180, 150, true, 100 },
	{ 234, 60,  true, 120 },
	{ 234, 120, true, 120 },
	{ 240, 400, true, 120 },
	{ 250, 250, true, 100 },
	{ 300, 250, true, 100 },
	{ 336, 280, true, 120 },
	{ 468, 60,  true, 120 },
	{ 468, 80,  true, 120 },
	{ 728, 90,  true, 120 }
};


// Converts the numeric score into a coarse verdict:
//   score >= 100 -> AD
//   score >= 60  -> PROBABLY_AD
//   score >= 20  -> PROBABLY_NOT_AD
//   otherwise    -> NOT_AD
HeuristicScore::Status
HeuristicScore::getStatus() const
{
	int const numeric = getNumericScore();
	
	// Thresholds are checked from the highest down, so each test only
	// needs a lower bound.
	if (numeric >= 100) {
		return AD;
	}
	if (numeric >= 60) {
		return PROBABLY_AD;
	}
	if (numeric >= 20) {
		return PROBABLY_NOT_AD;
	}
	return NOT_AD;
}

// Classifies a URL by the "shape" of its decoded path and query.
//
// The decoded path and query are joined as "<path>?<query>" (the '?' is
// always inserted) and scanned once, counting '=' characters, '/'
// characters, digits, and digit groups. A digit group is a run of more
// than 2 consecutive digits.
//
// Returns:
//   URL_INNOCENT   - the URL has no query, and the text has no '=',
//                    fewer than 3 slashes, fewer than 6 digits, fewer
//                    than 2 digit groups and no group longer than 4;
//   URL_SUSPICIOUS - 3 or more digit groups, or more than 3 '='
//                    characters, or an embedded "http://";
//   URL_MEDIOCRE   - everything else.
HeuristicScore::UrlStatus
HeuristicScore::getUrlStatus(URI const& url)
{
	BString path = url.getDecodedPath();
	BString query = url.getDecodedQuery();
	SBOutStream strm(path.size() + 1 + query.size());
	strm << path << '?' << query;
	SplittableBuffer path_plus_query;
	strm.swapData(path_plus_query);
	BString path_plus_query2(path_plus_query.toBString());
	
	int num_equals = 0;
	int num_slashes = 0;
	int num_digits = 0;
	int num_digit_groups = 0;
	int max_digit_group_size = 0;
	
	{
		char const* p = path_plus_query2.begin();
		char const* const end = path_plus_query2.end();
		while (p != end) {
			// The text is URL-decoded and may contain bytes >= 0x80.
			// Where char is signed those are negative, and passing a
			// negative value (other than EOF) to isdigit() is undefined
			// behaviour, hence the casts to unsigned char.
			
			// skip a run of non-digits, counting '=' and '/'
			for (; p != end && !isdigit(static_cast<unsigned char>(*p)); ++p) {
				if (*p == '=') {
					++num_equals;
				} else if (*p == '/') {
					++num_slashes;
				}
			}
			
			// consume the run of digits that follows (possibly empty)
			int group_size = 0;
			for (; p != end && isdigit(static_cast<unsigned char>(*p)); ++p) {
				++num_digits;
				++group_size;
			}
			if (group_size > 2) { // <= 2 digits is not considered to be a group
				++num_digit_groups;
				if (group_size > max_digit_group_size) {
					max_digit_group_size = group_size;
				}
			}
		}
	}
	
	if (!url.hasQuery() && num_equals == 0 && num_slashes < 3 && num_digits < 6
	    && num_digit_groups < 2 && max_digit_group_size <= 4) {
		// a URL like: http://somehost.com/images/logo2.png would be innocent
		// NOTE(review): a path such as /1024x768/ has 7 digits in 2 digit
		// groups and therefore does NOT pass this test.
		return URL_INNOCENT;
	}
	
	if (num_digit_groups >= 3 /*|| max_digit_group_size > 4*/ || num_equals > 3) {
		return URL_SUSPICIOUS;
	}
	
	// An embedded "http://" looks like a redirect target passed as a
	// parameter. Presumably find() yields a position at the right border
	// when there is no match -- confirm against SplittableBuffer.
	if (!path_plus_query.find(BString("http://")).isAtRightBorder()) {
		return URL_SUSPICIOUS;
	}
	
	return URL_MEDIOCRE;
}

// Determines how the host of url relates to the host of base.
//
// - A non-absolute url yields URLS_SAME_HOST (presumably because it would
//   be resolved against base).
// - An absolute url whose scheme is neither "http" nor "https" yields
//   URLS_RELATED.
// - Otherwise the two host names are compared by getDomainRelationship().
HeuristicScore::UrlRelationship
HeuristicScore::getUrlRelationship(URI const& url, URI const& base)
{
	if (!url.isAbsolute()) {
		return URLS_SAME_HOST;
	}
	
	InsensitiveEqual ieq;
	bool const is_web_scheme =
		ieq(url.getScheme(), BString("http")) ||
		ieq(url.getScheme(), BString("https"));
	if (!is_web_scheme) {
		// this will catch javascript: and about: urls
		return URLS_RELATED;
	}
	
	BString const& url_host = url.getHost();
	BString const& base_host = base.getHost();
	
	return getDomainRelationship(
		url_host.begin(), url_host.end(),
		base_host.begin(), base_host.end()
	);
}

bool
HeuristicScore::isCommonAdSize(int width, int height)
{
	SizeRecord const* rec = findSizeRecord(width, height);
	return rec && rec->is_ad;
}

int
HeuristicScore::getSizeScore(int width, int height)
{
	if ((width != -1 && width < 30) || (height != -1 && height < 15)) {
		// too small
		return -100;
	}
	
	SizeRecord const* rec = findSizeRecord(width, height);
	return rec ? rec->score : 0;
}

// Returns the hint that the global URL patterns give for url, multiplied
// by 10.
// NOTE(review): the range and sign of getHintFor() are not visible here.
// GlobalState::ReadAccessor presumably provides read access to the global
// state for the duration of this expression -- confirm.
int
HeuristicScore::getHintModifier(URI const& url)
{
	return GlobalState::ReadAccessor()->urlPatterns().getHintFor(url) * 10;
}

// Compares two host names given as character ranges [d1_begin, d1_end)
// and [d2_begin, d2_end).
//
// Returns URLS_SAME_HOST if the names are equal once a leading "www." is
// dropped from each. Otherwise the top-level domains are dropped as well
// and the names are compared label by label from the right: sharing at
// least one whole trailing label yields URLS_RELATED, sharing none yields
// URLS_UNRELATED.
HeuristicScore::UrlRelationship
HeuristicScore::getDomainRelationship(
	char const* d1_begin, char const* d1_end,
	char const* d2_begin, char const* d2_end)
{
	chopLeadingWWW(d1_begin, d1_end);
	chopLeadingWWW(d2_begin, d2_end);
	
	bool const same_host = StringUtils::ciEqual(
		d1_begin, d1_end, d2_begin, d2_end
	);
	if (same_host) {
		return URLS_SAME_HOST;
	}
	
	// tomshardware.com vs tomshardware.de should return URLS_RELATED
	// Note that we consider a domain like .co.uk to be a toplevel one.
	chopTopLevelDomain(d1_begin, d1_end);
	chopTopLevelDomain(d2_begin, d2_end);
	
	char const* tail1 = d1_end;
	char const* tail2 = d2_end;
	
	// Start of the leftmost label of the first name that is known to be
	// matched completely; d1_end means "no such label yet".
	char const* whole_label = d1_end;
	
	// walk from right to left while the characters are the same
	while (tail1 != d1_begin && tail2 != d2_begin) {
		int const c1 = tolower(static_cast<unsigned char>(tail1[-1]));
		int const c2 = tolower(static_cast<unsigned char>(tail2[-1]));
		if (c1 != c2) {
			break;
		}
		if (tail1[-1] == '.') {
			// everything to the right of this dot has matched
			whole_label = tail1;
		}
		--tail1;
		--tail2;
	}
	
	bool const inside_label1 = (tail1 != d1_begin && tail1[-1] != '.');
	bool const inside_label2 = (tail2 != d2_begin && tail2[-1] != '.');
	if (inside_label1 || inside_label2) {
		// at least one of the positions is in the middle of a subdomain,
		// so a partially matched label must not be counted
		tail1 = whole_label;
	}
	
	if (tail1 == d1_end) {
		// not a single whole label in common
		return URLS_UNRELATED;
	}
	return URLS_RELATED;
}

// If the range [begin, end) starts with "www." (as decided by
// StringUtils::ciStartsWith, i.e. presumably ignoring case), advances
// begin past that prefix. Otherwise leaves begin untouched.
void
HeuristicScore::chopLeadingWWW(char const*& begin, char const* end)
{
	static char const prefix[] = {'w','w','w','.'};
	char const* const prefix_end = prefix + ARRAY_SIZE(prefix);
	
	if (!StringUtils::ciStartsWith(begin, end, prefix, prefix_end)) {
		return;
	}
	begin += ARRAY_SIZE(prefix);
}

// Moves end to the left so that [begin, end) no longer contains the
// top-level domain nor the dot in front of it. A name without any dot is
// left unchanged.
//
// When the last label has two characters and is not the only thing after
// the first label, a two-level suffix may be removed instead:
//   - a two-character label followed by it (like .co.uk), or
//   - "com", "org" or "net" followed by it (like .com.au).
void
HeuristicScore::chopTopLevelDomain(char const* begin, char const*& end)
{
	// find the start of the last label (just past the rightmost dot)
	char const* tld = end;
	while (tld != begin && tld[-1] != '.') {
		--tld;
	}
	if (tld == begin) {
		// a dot wasn't found
		return;
	}
	
	if (end - tld == 2) {
		// find the start of the label in front of the last one
		char const* sld = tld - 1;
		while (sld != begin && sld[-1] != '.') {
			--sld;
		}
		
		if (sld != begin) {
			// length of "<label>.<tld>"
			switch (end - sld) {
			case 5:
				// We consider domains like .co.uk to be toplevel
				end = sld - 1; // sld points *past* the dot
				return;
			case 6: {
				// Some countries have domain system like .com.au, .org.au, etc
				BString const empty;
				BString l2_domain(empty, sld, sld + 3);
				InsensitiveEqual ieq;
				if (ieq(l2_domain, BString("com")) ||
				    ieq(l2_domain, BString("org")) ||
				    ieq(l2_domain, BString("net"))) {
					// As for gov and mil, not handling them here
					// results in host1.gov.au and host2.gov.au
					// to be marked as URLS_RELATED, which seems fair.
					end = sld - 1; // sld points *past* the dot
					return;
				}
				break;
			}
			default:
				break;
			}
		}
	}
	
	end = tld - 1; // tld points *past* the dot
}

// Looks up (width, height) in the sorted table of common sizes using
// SizeRecordComparator, which tolerates small deviations in either
// dimension. Returns a pointer to the matching entry, or 0 if there is
// no match.
HeuristicScore::SizeRecord const*
HeuristicScore::findSizeRecord(int width, int height)
{
	// only width and height take part in the comparison;
	// the remaining fields are zero-initialized
	SizeRecord probe = { width, height };
	SizeRecordComparator comp;
	
	SizeRecord const* const table_end =
		m_sCommonSizes + ARRAY_SIZE(m_sCommonSizes);
	SizeRecord const* const candidate =
		std::lower_bound(m_sCommonSizes, table_end, probe, comp);
	
	if (candidate == table_end) {
		// every entry orders before the probe
		return 0;
	}
	if (comp(probe, *candidate)) {
		// the first entry not less than the probe is greater than it
		return 0;
	}
	return candidate;
}
