package ir.webutils;

import ir.utilities.*;

/**
 * Evaluates a web link (ScoredAnchoredLink) based on satisfying a set
 * of "want strings" and "help strings". 
 * <p> The existing search heuristic considers four factors in order of importance:
 * <ol>
 * <li> (WC) The number of the want-strings that are found
 * <li> (WT) The total number of times a want-string is found
 * <li> (HC) The number of the help-strings that are found
 * <li> (HT) The total number of times a help-string is found
 * </ol>
 * A page is scored as <br><br> S = 1000*WC + 100*WT + 10*HC + HT <br><br> A link
 * is scored partly based on the text appearing directly in the link and partly
 * based on the surrounding page. If L is the S score for the text in the link and
 * P is the S score for the overall page, then a link is scored as <br> <br> L/2 +
 * P/2 <br><br> getting half its score from its own text and half from its
 * surrounding page.
 *
 * @author Ray Mooney */

public class LinkHeuristic extends Object {

    /** The array of want strings that are desired */
    public String[] wantStrings = new String[0];

    /** The array of help strings to help find the want strings */
    public String[] helpStrings = new String[0];

    /** Construct an empty heuristic (no want strings and no help strings). */
    public LinkHeuristic() { }

    /**
     * Construct a heuristic with the given wantStrings and helpStrings.
     * A <code>null</code> argument is treated as an empty array so that
     * scoring never fails on a missing list.
     *
     * @param wantStrings the strings that are desired; may be null
     * @param helpStrings the strings that help find the want strings; may be null
     */
    public LinkHeuristic(String[] wantStrings, String[] helpStrings) {
        this.wantStrings = (wantStrings != null) ? wantStrings : new String[0];
        this.helpStrings = (helpStrings != null) ? helpStrings : new String[0];
    }

    /**
     * Heuristically score the given link appearing on the given page.
     * The result is half the score of the page text plus half the score
     * of the link's anchor text.
     *
     * @param link the link to score; the page score is cached in the
     *             pageScore field of its back link
     * @param page the page on which the link appears
     * @return 0.5 * (page score) + 0.5 * (anchor text score)
     */
    public double scoreLink(ScoredAnchoredLink link, HTMLPage page) {
        // The score for the source page is cached in the pageScore of the
        // backLink of the current link so that it does not need to be
        // computed more than once. A pageScore of -1 is taken to mean
        // "not computed yet".
        if (link.getBackLink().pageScore == -1) {
            link.getBackLink().pageScore = scoreString(page.getText());
        }
        // A link gets half its score from its anchor text and half from the
        // surrounding page.
        return 0.5 * link.getBackLink().pageScore + 0.5 * scoreString(link.getAnchorText());
    }

    /**
     * Score how well a piece of text matches the want and help strings.
     * The score is
     * 1000 * (number of distinct want strings found)
     * + 100 * (total want string occurrences)
     * + 10 * (number of distinct help strings found)
     * + (total help string occurrences).
     *
     * @param text the text to score
     * @return the weighted score described above
     */
    double scoreString(String text) {
        StringSearchResult wantStringResult = searchStrings(wantStrings, text);
        StringSearchResult helpStringResult = searchStrings(helpStrings, text);
        return 1000 * wantStringResult.numberFound + 100 * wantStringResult.numberOccurrences +
            10 * helpStringResult.numberFound + helpStringResult.numberOccurrences;
    }

    /**
     * Determine how many of an array of strings occur in a piece of text and
     * how many total occurrences there are of these strings.
     * A null array or null text yields a result with both counts zero, and
     * null entries in the array are skipped.
     *
     * @param strings the strings to look for; may be null
     * @param text    the text to search; may be null
     * @return a StringSearchResult holding the number of distinct strings
     *         found and the total number of occurrences
     */
    StringSearchResult searchStrings(String[] strings, String text) {
        // Number of the different strings that are found in the text
        int numberFound = 0;
        // Total number of occurrences of any of these strings in the text
        int numberOccurrences = 0;
        // Nothing to search for, or nothing to search in: no matches.
        // (The public array fields can be reassigned to null by callers.)
        if (strings == null || text == null) {
            return new StringSearchResult(numberFound, numberOccurrences);
        }
        // Search for each of the strings
        for (int i = 0; i < strings.length; i++) {
            // A missing entry cannot match anything
            if (strings[i] == null) {
                continue;
            }
            // Count the number of times it occurs in the text
            int count = MoreString.countPhrase(text, strings[i]);
            // If it occurs at least once, increment numberFound
            if (count > 0) {
                numberFound++;
            }
            // Keep running total number of occurrences
            numberOccurrences = numberOccurrences + count;
        }
        return new StringSearchResult(numberFound, numberOccurrences);
    }

}
    
	
