package ir.webutils;

import java.util.*;
import java.net.*;

/** 
 * A BeamSearchSpider that limits itself to a given site (web host).
 *
 * @author Ray Mooney */
public class BeamSearchSiteSpider extends BeamSearchSpider {

    /**
     * Gets links from the given page that are on the same host as the
     * page itself.
     *
     * <p>Links are extracted with a {@link ScoredAnchoredLinkExtractor} and
     * then filtered in place: every link whose URL host differs from the
     * host of the page's own URL is removed.  Host names are compared
     * without regard to case, because <code>URL.getHost()</code> returns
     * the host exactly as it was written in the link and host names are
     * case-insensitive.
     *
     * @param page The page whose outgoing links are wanted.
     * @return The list of links on <code>page</code> whose host matches
     * the host of <code>page</code>'s own URL.
     */
    public List<Link> getNewLinks(HTMLPage page) {
        List<Link> links = new ScoredAnchoredLinkExtractor(page).extractLinks();
        // The page's host is loop-invariant, so look it up once.
        String siteHost = page.getLink().getURL().getHost();
        Iterator<Link> iterator = links.iterator();
        while (iterator.hasNext()) {
            URL target = iterator.next().getURL();
            // Case-insensitive: "WWW.Site.org" and "www.site.org" are the same host.
            if (!siteHost.equalsIgnoreCase(target.getHost())) {
                iterator.remove();
            }
        }
        return links;
    }

    /** Search the web using beam search according to the following command options,
     * but stay within the initial host site.
     * <ul>
     * <li>-safe : Check for and obey robots.txt and robots META tag directives.</li>
     * <li>-c &lt;maxCount&gt; : Download at most &lt;maxCount&gt; pages.</li>
     * <li>-u &lt;url&gt; : Start at &lt;url&gt;.</li>
     * <li>-w &lt;strings&gt; : &lt;strings&gt; should be a list of "need strings" separated by ";"'s.</li>
     * <li>-h &lt;strings&gt; : &lt;strings&gt; should be a list of "help strings" separated by ";"'s.</li>
     * <li>-b &lt;size&gt; : Use a beam width of given &lt;size&gt;</li>
     * <li>-slow : Pause briefly before getting a page.  This can be useful when debugging.</li>
     * </ul>
     *
     * @param args The command-line options, passed unchanged to <code>go</code>.
     */
    public static void main(String args[]) {
        new BeamSearchSiteSpider().go(args);
    }
}









