package ir.webutils;

import java.util.*;
import ir.utilities.*;

/**
 * A spider that uses heuristic beam search to find a web page that
 * contains a set of "want strings" using a set of "help strings"
 * to guide the search.  Conducts a search through a space of ScoredAnchoredLinks
 * to find a page that satisfies the goal, i.e. contains all of the "want strings".
 *
 * @author Ray Mooney */

public class BeamSearchSpider extends Spider {

    /** Defines the goal predicate over HTMLPage's that is to be satisfied. */
    protected PageGoal goal;

    /** Defines the heuristic that is used to sort ScoredAnchoredLink's in the queue */
    protected LinkHeuristic heuristic;

    /** The beam width to use.  Size of queue is kept to the best beamSize links. */
    protected int beamSize = 100;

    /** The page found that satisfies the goal, or null if none has been found. */
    protected HTMLPage goalPage = null;

    /**
     * Interprets command line arguments and performs the crawl.
     * Reports an error and returns without crawling when no "want strings"
     * were given.  Otherwise echoes the want and help strings, crawls, and
     * determines if a goal page was found; if so prints the path to the goal
     * page from the start URL and displays the page using Browser.
     *
     * @param args Command line arguments.  */
    public void go(String[] args) {
        processArgs(args);
        if (heuristic == null || heuristic.wantStrings == null) {
            System.out.println("Error: No want strings specified.");
            return;
        }
        System.out.print("\nSearch for: ");
        for (int i = 0; i < heuristic.wantStrings.length; i++) {
            System.out.print("\"" + heuristic.wantStrings[i] + "\" ");
        }
        System.out.print("\nHelped by: ");
        // Help strings are optional ("-h" may be omitted), so guard against
        // the array never having been assigned.
        if (heuristic.helpStrings != null) {
            for (int i = 0; i < heuristic.helpStrings.length; i++) {
                System.out.print("\"" + heuristic.helpStrings[i] + "\" ");
            }
        }
        System.out.println("");
        doCrawl();
        if (goalPage == null) {
            System.out.println("\nGoal page not found");
            return;
        }
        // Print path from start URL to goal
        System.out.println("\nGoal Page found.  Path from start URL is:");
        printPath(goalPage.getLink());
        // Display goal page in Browser
        Browser.display(goalPage.getLink().getURL().toString());
    }

    /**
     * Processes command-line arguments.  <p> The following options are
     * handled by this function:
     * <ul>
     * <li>-safe : Check for and obey robots.txt and robots META tag directives.</li>
     * <li>-c &lt;maxCount&gt; : Download at most &lt;maxCount&gt; pages.</li>
     * <li>-u &lt;url&gt; : Start at &lt;url&gt;.</li>
     * <li>-w &lt;strings&gt; : &lt;strings&gt; should be a list of "want strings" separated by ";"'s.</li>
     * <li>-h &lt;strings&gt; : &lt;strings&gt; should be a list of "help strings" separated by ";"'s.</li>
     * <li>-b &lt;size&gt; : Use a beam width of given &lt;size&gt;</li>
     * <li>-slow : Pause briefly before getting a page.  This can be useful when debugging.</li>
     * </ul>
     *
     * Each option has a corresponding
     * <code>handleXXXCommandLineOption</code> function that will be
     * called when the option is found.  Subclasses may find it
     * convenient to change how options are handled by overriding
     * those methods instead of this one.  Only the above options will
     * be dealt with by this function (any other argument is ignored),
     * and the input array will remain unchanged.  Note that if the flag
     * for an option appears in the input array, any value associated
     * with that option will be assumed to follow.  Thus if a "-c" flag
     * appears in <code>args</code>, the next value in <code>args</code>
     * will be blindly treated as the count.
     *
     * @param args Array of arguments as passed in from the command
     * line.
     * @throws IllegalArgumentException if an option that takes a value
     * is the last element of <code>args</code>.  */
    public void processArgs(String[] args) {
        int i = 0;
        while (i < args.length) {
            String option = args[i];
            // Exact matches only; every recognized option starts with '-',
            // so no separate prefix test is needed (and an empty argument
            // is simply ignored instead of failing).
            if ("-safe".equals(option)) {
                handleSafeCommandLineOption();
            } else if ("-c".equals(option)) {
                handleCCommandLineOption(requireOptionValue(args, ++i, option));
            } else if ("-u".equals(option)) {
                handleUCommandLineOption(requireOptionValue(args, ++i, option));
            } else if ("-w".equals(option)) {
                handleWCommandLineOption(requireOptionValue(args, ++i, option));
            } else if ("-h".equals(option)) {
                handleHCommandLineOption(requireOptionValue(args, ++i, option));
            } else if ("-b".equals(option)) {
                handleBCommandLineOption(requireOptionValue(args, ++i, option));
            } else if ("-slow".equals(option)) {
                handleSlowCommandLineOption();
            }
            ++i;
        }
    }

    /**
     * Returns the value that follows an option flag, failing with a
     * descriptive message when the flag was the last argument.
     *
     * @param args The full argument array.
     * @param index Position at which the option's value is expected.
     * @param option The flag being processed (used in the error message).
     * @return <code>args[index]</code>.
     * @throws IllegalArgumentException if <code>index</code> is past the
     * end of <code>args</code>.
     */
    private static String requireOptionValue(String[] args, int index, String option) {
        if (index >= args.length) {
            throw new IllegalArgumentException("Missing value for option " + option);
        }
        return args[index];
    }

    /**
     * Called when "-u" is passed in on the command line.  <p> This
     * implementation adds <code>value</code> to the list of links to
     * visit.  This version creates an initial ScoredAnchoredLink.
     *
     * @param value The value associated with the "-u" option. */
    protected void handleUCommandLineOption(String value) {
        linksToVisit.add(new ScoredAnchoredLink(value));
    }

    /**
     * Called when "-w" is passed in on the command line to
     * set "want strings".
     * Sets "want strings" for the search by parsing the value
     * into an array of want strings using ";" as a separator.
     * Uses result to initialize goal and heuristic; the heuristic is
     * created via {@link #constructLinkHeuristic()} if it does not exist yet.
     *
     * @param value The value associated with the "-w" option.
     */
    protected void handleWCommandLineOption(String value) {
        String[] wantStrings = MoreString.segmentToArray(value, ';');
        // Set goal based on wantStrings
        goal = new PageGoal(wantStrings);
        // Set heuristic based on wantStrings
        if (heuristic == null) {
            heuristic = constructLinkHeuristic();
        }
        heuristic.wantStrings = wantStrings;
    }


    /**
     * Called when "-h" is passed in on the command line to set
     * help strings.
     * Sets "help strings" for the search by parsing the value
     * into an array of help strings using ";" as a separator.
     * Uses result to initialize heuristic; the heuristic is created via
     * {@link #constructLinkHeuristic()} if it does not exist yet.
     *
     * @param value The value associated with the "-h" option.
     */
    protected void handleHCommandLineOption(String value) {
        if (heuristic == null) {
            heuristic = constructLinkHeuristic();
        }
        heuristic.helpStrings = MoreString.segmentToArray(value, ';');
    }

    /**
     * Return default LinkHeuristic.  Specializations can override
     * this method to utilize alternate link heuristics.
     *
     * @return A newly constructed LinkHeuristic.
     */
    protected LinkHeuristic constructLinkHeuristic() {
        return new LinkHeuristic();
    }

    /**
     * Called when "-b" is passed in on the command line to
     * set the beam width.
     *
     * @param value The value associated with the "-b" option; must parse
     * as a positive integer.
     * @throws NumberFormatException if <code>value</code> is not an integer.
     * @throws IllegalArgumentException if the parsed width is less than 1.
     */
    protected void handleBCommandLineOption(String value) {
        int width = Integer.parseInt(value);
        // A negative width would only fail later, when the queue is trimmed
        // with subList(beamSize, ...); a zero width would empty the queue
        // after the first expansion.  Reject both here with a clear message.
        if (width < 1) {
            throw new IllegalArgumentException(
                "Beam width must be a positive integer: " + value);
        }
        beamSize = width;
    }

    /**
     * Crawls the web using beam search with given heuristic to
     * find a page that satisfies goal.  Sets goalPage if successful.
     * <p>
     * Repeatedly removes the first link from the queue, retrieves its
     * page, and tests it against <code>goal</code> (which must already be
     * set, normally via the "-w" option).  If the page is not a goal, its
     * outgoing links are scored, merged into the queue, the queue is
     * re-sorted, and then truncated to the best <code>beamSize</code> links.
     * Every link taken off the queue increments <code>count</code>, including
     * links that are then skipped.
     */
    public void doCrawl() {
        // Initialize set of visited pages and goalPage to empty
        visited = new HashSet<Link>();
        goalPage = null;
        // Search until queue is empty or maxCount exceeded
        while (linksToVisit.size() > 0 && count < maxCount) {
            // Pause if in slow mode
            if (slow) {
                synchronized (this) {
                    try {
                        wait(1000);
                    }
                    catch (InterruptedException e) {
                        // Do not lose the interruption request: restore the
                        // flag so code further up the stack can observe it.
                        Thread.currentThread().interrupt();
                    }
                }
            }
            // Take the top link off the queue
            ScoredAnchoredLink link = (ScoredAnchoredLink) linksToVisit.remove(0);
            count++;
            System.out.println("\nExpanding(" + count + "): " + link + "\nScore: " + link.score);
            // Skip if already visited this page
            if (!visited.add(link)) {
                System.out.println("Already visited");
                continue;
            }
            if (!linkToHTMLPage(link)) {
                System.out.println("Not HTML Page");
                continue;
            }
            HTMLPage currentPage = null;
            // Use the page retriever to get the page
            try {
                currentPage = retriever.getHTMLPage(link);
            }
            catch (PathDisallowedException e) {
                System.out.println(e);
                continue;
            }
            if (currentPage.empty()) {
                System.out.println("No Page Found");
                continue;
            }
            // If goal page found, set goalPage and exit.
            if (goal.satisfiedBy(currentPage)) {
                goalPage = currentPage;
                return;
            }
            // No point in expanding the page if the budget is already spent.
            if (count < maxCount) {
                List<Link> newLinks = getNewLinks(currentPage);
                // Score the new links based on the heuristic.
                scoreLinks(newLinks, currentPage);
                // Add new links to end of queue
                linksToVisit.addAll(newLinks);
                // Sort the queue in order of heuristic quality
                Collections.sort(linksToVisit);
                // If queue is too long, trim it back to the first beamSize elements
                if (linksToVisit.size() > beamSize) {
                    linksToVisit.subList(beamSize, linksToVisit.size()).clear();
                }
            }
        }
    }

    /**
     * Returns a list of links to follow from a given page, as produced by
     * a ScoredAnchoredLinkExtractor.  The links are scored afterwards by
     * {@link #scoreLinks(List, HTMLPage)}.
     *
     * @param page The current page.
     *
     * @return Links to be visited from this page */
    protected List<Link> getNewLinks(HTMLPage page) {
        return new ScoredAnchoredLinkExtractor(page).extractLinks();
    }


    /**
     * Use the heuristic to score each of the new links on a given page that
     * was expanded.  Each element of <code>links</code> is cast to
     * ScoredAnchoredLink and has its <code>score</code> field assigned.
     *
     * @param links The links extracted from <code>page</code>.
     * @param page The page the links were found on.
     */
    protected void scoreLinks(List<Link> links, HTMLPage page) {
        for (Link candidate : links) {
            ScoredAnchoredLink scored = (ScoredAnchoredLink) candidate;
            scored.score = heuristic.scoreLink(scored, page);
        }
    }

    /**
     * Print the path from the start URL to this link.
     *
     * @param link The final link of the path; it is cast to ScoredAnchoredLink.
     */
    void printPath(Link link) {
        printPath((ScoredAnchoredLink) link);
    }

    /**
     * Use recursion to print the path from the start URL to this link.
     * Back links are followed first so that the path is printed starting
     * with the link that has no back link and ending with
     * <code>link</code> itself.
     *
     * @param link The final link of the path.
     */
    void printPath(ScoredAnchoredLink link) {
        if (link.getBackLink() != null) {
            printPath(link.getBackLink());
        }
        System.out.println("  " + link);
    }


    /** Search the web using beam search according to the following command options:
     * <ul>
     * <li>-safe : Check for and obey robots.txt and robots META tag directives.</li>
     * <li>-c &lt;maxCount&gt; : Download at most &lt;maxCount&gt; pages (default is 10,000).</li>
     * <li>-u &lt;url&gt; : Start at &lt;url&gt;.</li>
     * <li>-w &lt;strings&gt; : &lt;strings&gt; should be a list of "want strings" separated by ";"'s.</li>
     * <li>-h &lt;strings&gt; : &lt;strings&gt; should be a list of "help strings" separated by ";"'s.</li>
     * <li>-b &lt;size&gt; : Use a beam width of given &lt;size&gt; (default is 100)</li>
     * <li>-slow : Pause briefly before getting a page.  This can be useful when debugging.</li>
     * </ul>
     *
     * @param args Command line arguments.
     */
    public static void main(String[] args) {
        new BeamSearchSpider().go(args);
    }

}
