import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.Scanner;

/**
 * Sparse matrix held as three parallel arrays and loaded from four
 * whitespace-separated text streams.
 *
 * The column-pointer array has n+1 entries and the row-index and value arrays
 * have "count" entries each; callers read the entries of column i from the
 * positions colPtrs[i] .. colPtrs[i+1]-1.
 */
class SparseMatrix {
    /** First and second numbers of the dimension stream. */
    int m, n;
    /** Third number of the dimension stream: how many entries are stored. */
    int count;

    /** n+1 offsets into rowIndices/val. */
    int colPtrs[];
    /** One index per stored entry. */
    int rowIndices[];
    /** One value per stored entry. */
    float val[];

    /**
     * Reads the matrix.
     *
     * Numbers are parsed with Double.parseDouble, so exponent notation such as
     * "1.5e-3" is read correctly. A stream that ends early or contains a token
     * that is not a number is reported on System.err and the remaining slots
     * of that array keep their default value of zero.
     *
     * @param dim stream holding m, n and count, in that order
     * @param col stream holding n+1 column pointers
     * @param row stream holding count row indices
     * @param nz  stream holding count values
     */
    public SparseMatrix(Reader dim, Reader col, Reader row, Reader nz) { //modified
        int header[] = new int[3];
        readInts(dim, header, "dim");
        m = header[0];
        n = header[1];
        count = header[2];

        colPtrs = new int[n+1];
        rowIndices = new int[count];
        val = new float[count];

        readInts(col, colPtrs, "col");
        readInts(row, rowIndices, "row");
        readFloats(nz, val, "nz");
    }

    /** Fills dest with integers from in; label names the stream in error messages. */
    private static void readInts(Reader in, int dest[], String label) {
        StreamTokenizer tok = numberTokenizer(in);
        try {
            for(int i=0; i < dest.length; i++)
                dest[i] = (int)nextNumber(tok);
        }
        catch(IOException e) {
            System.err.println("Ill formed matrix file: "+label);
        }
    }

    /** Fills dest with floats from in; label names the stream in error messages. */
    private static void readFloats(Reader in, float dest[], String label) {
        StreamTokenizer tok = numberTokenizer(in);
        try {
            for(int i=0; i < dest.length; i++)
                dest[i] = (float)nextNumber(tok);
        }
        catch(IOException e) {
            System.err.println("Ill formed matrix file: "+label);
        }
    }

    /**
     * Builds a tokenizer that only splits on whitespace. The built-in number
     * parsing of StreamTokenizer is switched off because it stops at the 'e'
     * of an exponent and would misalign every following value.
     */
    private static StreamTokenizer numberTokenizer(Reader in) {
        StreamTokenizer tok = new StreamTokenizer(new BufferedReader(in));
        tok.resetSyntax();
        tok.whitespaceChars(0, ' ');
        tok.wordChars('!', 255);
        return tok;
    }

    /** Returns the next token as a number, or throws IOException if there is none. */
    private static double nextNumber(StreamTokenizer tok) throws IOException {
        if(tok.nextToken() != StreamTokenizer.TT_WORD)
            throw new IOException("unexpected end of input");
        try {
            return Double.parseDouble(tok.sval);
        }
        catch(NumberFormatException e) {
            throw new IOException("not a number: "+tok.sval);
        }
    }
}

/**
 * Mutable record for one document: its name, the indices of the words it
 * contains and the weight stored for each of those words.
 */
class Document {
    /** Display name of the document. */
    String name;
    /** Word indices; parallel to wordCounts. */
    int[] words;
    /** Weight of each word listed in words. */
    float[] wordCounts;
    /** Score assigned later by the cluster code; zero until then. */
    double dist;

    /**
     * Stores the given references as-is (no copies are made).
     *
     * @param name       document name
     * @param words      word indices
     * @param wordCounts per-word weights, parallel to words
     */
    public Document(String name, int[] words, float[] wordCounts) {
        this.wordCounts = wordCounts;
        this.words = words;
        this.name = name;
    }
}

class ClusterBrowser {
    /**
     * Command-line entry point.
     *
     * Arguments: basePath numClusters docURL [word]
     *   basePath    - "[dir/]collection_scaling"; everything up to the last "/"
     *                 is the directory (or ftp:// / http:// prefix), the part
     *                 after the last "_" is the scaling name
     *   numClusters - integer number of clusters
     *   docURL      - base URL prepended to document names in the output
     *   word        - optional name of a word-cluster file
     *
     * example: java -cp CB.jar CB nfs_tfn 10 ftp://ftp.cs.utexas.edu/pub/inderjit/Data/Text/nsf/
     *
     * Any wrong argument count or malformed argument prints the usage line to
     * System.err and returns; I/O failures are printed to System.err.
     */
    public static void main(String args[]) {
        final String usage = "Usage: java -cp CB.jar CB basePath numClusters docURL [word]";

        // Three mandatory arguments plus one optional one. Fewer than three
        // used to fall through and crash on args[0]..args[2].
        if(args.length < 3 || args.length > 4) {
            System.err.println(usage);
            return;
        }

        int pathIndex = args[0].lastIndexOf("/")+1;
        int scalingIndex = args[0].lastIndexOf("_");
        // The collection name is cut out between the last "/" and the last "_";
        // without an "_" in that position substring() would throw.
        if(scalingIndex < pathIndex) {
            System.err.println("basePath must end in <collection>_<scaling>: "+args[0]);
            System.err.println(usage);
            return;
        }

        int numClust;
        try {
            numClust = Integer.parseInt(args[1]);
        }
        catch(NumberFormatException e) {
            System.err.println("numClusters must be an integer: "+args[1]);
            System.err.println(usage);
            return;
        }

        try {
            String pathStr = args[0].substring(0, pathIndex);
            String collStr = args[0].substring(pathIndex, scalingIndex);
            String scalingStr = args[0].substring(scalingIndex+1);
            String documentURL = args[2];
            if(!documentURL.endsWith("/"))
                documentURL += "/";

            ClusterBrowser cb = new ClusterBrowser();
            Reader clusterIn, nameIn, wordIn, dim, col, row, nz;
            if(pathStr.startsWith("ftp://")) {
                clusterIn = ftpReader(pathStr+collStr+"_"+scalingStr+
                                      "_doctoclus."+numClust);
                nameIn = ftpReader(pathStr+collStr+"_docs");
                wordIn = ftpReader(pathStr+collStr+"_words");
                dim = ftpReader(pathStr+collStr+"_dim");
                col = ftpReader(pathStr+collStr+"_col_ccs");
                row = ftpReader(pathStr+collStr+"_row_ccs");
                nz = ftpReader(pathStr+collStr+"_"+scalingStr+"_nz");
            }
            else if(pathStr.startsWith("http://")) {
                URL url = new URL(pathStr+collStr+"_"+scalingStr+
                                  "_doctoclus."+numClust);
                clusterIn = new InputStreamReader(url.openStream());

                url = new URL(pathStr+collStr+"_docs");
                nameIn = new InputStreamReader(url.openStream());

                // NOTE(review): the ftp and file branches read "<coll>_words";
                // only this branch inserts the scaling name. Confirm which is intended.
                url = new URL(pathStr+collStr+"_"+scalingStr+"_words");
                wordIn = new InputStreamReader(url.openStream());

                url = new URL(pathStr+collStr+"_dim");
                dim = new InputStreamReader(url.openStream());

                url = new URL(pathStr+collStr+"_col_ccs");
                col = new InputStreamReader(url.openStream());

                url = new URL(pathStr+collStr+"_row_ccs");
                row = new InputStreamReader(url.openStream());

                // NOTE(review): the other branches use the suffix "_nz", this one ".nz".
                url = new URL(pathStr+collStr+"_"+scalingStr+".nz");
                nz = new InputStreamReader(url.openStream());
            }
            else { // file://
                clusterIn = new FileReader(pathStr+collStr+"_"+scalingStr+
                                           "_doctoclus."+numClust);
                nameIn = new FileReader(pathStr+collStr+"_docs");
                wordIn = new FileReader(pathStr+collStr+"_words");
                dim = new FileReader(pathStr+collStr+"_dim");
                col = new FileReader(pathStr+collStr+"_col_ccs");
                row = new FileReader(pathStr+collStr+"_row_ccs");
                nz = new FileReader(pathStr+collStr+"_"+scalingStr+"_nz");
            }

            // Output directory, created relative to the working directory.
            File f = new File(collStr+"_"+scalingStr+numClust);
            f.mkdir();
            if(args.length == 3)
                cb.makeClusterBrowser(clusterIn, nameIn, wordIn, dim,col,row,nz,
                                      documentURL, numClust,
                                      collStr+"_"+scalingStr+numClust);
            else {
                Reader words;
                if(pathStr.startsWith("ftp://"))
                    words = ftpReader(pathStr+collStr+args[3]);
                else if(pathStr.startsWith("http://")) {
                    URL url = new URL(pathStr+collStr+args[3]);
                    words = new InputStreamReader(url.openStream());
                }
                else
                    // NOTE(review): unlike the two remote branches this path
                    // does not include collStr. Confirm which is intended.
                    words = new FileReader(pathStr+args[3]);
                cb.makeClusterBrowser(clusterIn, nameIn, wordIn, dim,col,row,nz, words,
                                      documentURL, numClust,
                                      collStr+"_"+scalingStr+numClust);
            }
        }
        catch(IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Downloads the whole resource at urlStr into memory and returns a reader
     * over its text (decoded with the platform default charset).
     *
     * The stream is read until end-of-input rather than trusting the reported
     * content length, which may be -1 when the server does not supply it. A
     * "#" progress mark is printed for every 10000 bytes started.
     *
     * @param urlStr URL to fetch
     * @return reader over the downloaded text, or an empty reader if the
     *         download failed (the exception is printed to System.out)
     */
    static Reader ftpReader(String urlStr) {
        InputStream in = null;
        try {
            URL url = new URL(urlStr);
            URLConnection uc = url.openConnection();
            int length = uc.getContentLength();
            in = uc.getInputStream();

            System.out.println("url="+urlStr+" length="+length);

            // The reported length is only a sizing hint; it may be -1.
            ByteArrayOutputStream contents =
                new ByteArrayOutputStream(length > 0 ? length : 8192);
            byte chunk[] = new byte[8192];
            long total = 0;
            long nextMark = 0;
            int got;
            while((got = in.read(chunk)) != -1) {
                contents.write(chunk, 0, got);
                total += got;
                while(nextMark < total) {
                    System.out.print("#");
                    nextMark += 10000;
                }
            }
            System.out.println("Done");
            return new StringReader(contents.toString());
        }
        catch(IOException e) {
            System.out.println(e);
            return new StringReader("");
        }
        finally {
            if(in != null) {
                try {
                    in.close();
                }
                catch(IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Returns the words with the largest values, best first.
     *
     * Ties keep the word with the lower index first. NaN values are ranked
     * after every real number. When count exceeds the number of values the
     * result is shortened to values.length instead of repeating a word.
     *
     * @param words  word for each index of values
     * @param values score of each word
     * @param count  how many words are wanted
     * @return array of min(count, values.length) words
     */
    static String[] getTopWords(String words[], double values[], int count) {
        int wanted = Math.min(count, values.length);
        String topWords[] = new String[wanted];
        boolean used[] = new boolean[values.length];

        for(int i=0; i < wanted; i++) {
            // Track the best unused index itself rather than a sentinel score,
            // so arbitrarily negative values are still ranked correctly.
            int best = -1;
            for(int j=0; j < values.length; j++) {
                if(used[j])
                    continue;
                if(best < 0 || values[j] > values[best] || Double.isNaN(values[best]))
                    best = j;
            }
            topWords[i] = words[best];
            used[best] = true;
        }

        return topWords;
    }

    /**
     * Parses the last non-empty "."-separated piece of str as an integer.
     *
     * @param str text such as "name.12"
     * @return the integer value of the last piece
     * @throws NumberFormatException if that piece is not an integer, or if
     *         str contains no piece at all (empty or only dots)
     */
    static public int getTailInt(String str) throws NumberFormatException {
        // Ignore trailing dots: they do not start a new piece.
        int end = str.length();
        while(end > 0 && str.charAt(end-1) == '.')
            end--;

        // "x" is the placeholder parsed when there is no piece, which makes
        // parseInt raise the NumberFormatException.
        String tail = "x";
        if(end > 0) {
            int start = str.lastIndexOf('.', end-1) + 1;
            tail = str.substring(start, end);
        }

        return Integer.parseInt(tail);
    }

    /**
     * Loads every input and writes the HTML cluster browser.
     *
     * Stops with a message on System.err, without writing anything, when an
     * input could not be read or the inputs disagree about their sizes.
     *
     * @param clusterIn   cluster assignment of each document
     * @param nameIn      document names
     * @param wordIn      word list
     * @param dim         matrix dimensions
     * @param col         matrix column pointers
     * @param row         matrix row indices
     * @param nz          matrix values
     * @param descURL     base URL prepended to document names in links
     * @param numClusters number of clusters
     * @param baseName    output directory and file-name prefix
     */
    public void makeClusterBrowser(Reader clusterIn, Reader nameIn, Reader wordIn,
                                   Reader dim, Reader col, Reader row, Reader nz,//modified
                                   String descURL, int numClusters, String baseName) {

        System.out.println("Loading cluster file");
        int clusterVector[] = loadClusterVector(clusterIn);
        // The loader signals failure with null.
        if(clusterVector == null) {
            System.err.println("Cluster file could not be read");
            return;
        }
        System.out.println("Loading document names file");
        String names[] = loadNames(nameIn, clusterVector.length);
        System.out.println("Loading words file");
        String words[] = loadWords(wordIn);
        if(words == null) {
            System.err.println("Words file could not be read");
            return;
        }
        System.out.println("Loading words by documents matrix");
        SparseMatrix docMatrix = new SparseMatrix(dim, col, row, nz);
        if(words.length != docMatrix.m) {
            System.err.println("Words file and doc file have inconsistant sizes");
            System.err.println("Words file has "+words.length+" words");
            System.err.println("Doc file uses "+docMatrix.m+" words");
            return;
        }
        // Each document needs its own matrix column.
        if(names.length > docMatrix.n) {
            System.err.println("Cluster file and doc file have inconsistent sizes");
            System.err.println("Cluster file has "+names.length+" documents");
            System.err.println("Doc file has "+docMatrix.n+" documents");
            return;
        }
        System.out.println("Creating cluster browser");
        Document docs[] = compileDocuments(names, docMatrix);
        int clusters[][] = compileClusters(clusterVector, numClusters);
        double clusterWordMeans[][] =
            compileClusterWordMeans(clusters, docs, words.length);
        sortClusters(clusters, docs);

        produceWordHTMLs(clusterWordMeans, words, baseName);
        produceHTML(baseName, clusters, names, words, docs, clusterWordMeans, descURL);
    }


/**
 * Reads a word-cluster file: a leading entry count followed by that many
 * integers.
 *
 * @param words source of the numbers
 * @return the integers after the count, or null if reading failed
 */
int [] loadWordCluster(Reader words)
    {
        StreamTokenizer tokens = new StreamTokenizer(words);
        try {
            tokens.nextToken();
            final int total = (int) tokens.nval;

            int[] assignments = new int[total];
            int filled = 0;
            while(filled < total) {
                tokens.nextToken();
                assignments[filled++] = (int) tokens.nval;
            }
            return assignments;
        }
        catch(IOException e) {
            System.err.println("Ill formed matrix file: word cluster");
            return null;
        }
    }

/**
 * Loads every input, including a word-cluster file, and writes the HTML
 * cluster browser with a word-cluster column in the per-cluster word tables.
 *
 * Stops with a message on System.err, without writing anything, when an
 * input could not be read or the inputs disagree about their sizes.
 *
 * @param clusterIn   cluster assignment of each document
 * @param nameIn      document names
 * @param wordIn      word list
 * @param dim         matrix dimensions
 * @param col         matrix column pointers
 * @param row         matrix row indices
 * @param nz          matrix values
 * @param wordsC      cluster assignment of each word
 * @param descURL     base URL prepended to document names in links
 * @param numClusters number of clusters
 * @param baseName    output directory and file-name prefix
 */
public void makeClusterBrowser(Reader clusterIn, Reader nameIn, Reader wordIn,
                                   Reader dim, Reader col, Reader row, Reader nz, Reader wordsC,//modified
                                   String descURL, int numClusters, String baseName) {

        System.out.println("Loading cluster file");
        int clusterVector[] = loadClusterVector(clusterIn);
        // The loader signals failure with null.
        if(clusterVector == null) {
            System.err.println("Cluster file could not be read");
            return;
        }
        System.out.println("Loading word cluster file");
        int wordCluster[] = loadWordCluster(wordsC);
        if(wordCluster == null) {
            System.err.println("Word cluster file could not be read");
            return;
        }
        System.out.println("Loading document names file");
        String names[] = loadNames(nameIn, clusterVector.length);
        System.out.println("Loading words file");
        String words[] = loadWords(wordIn);
        if(words == null) {
            System.err.println("Words file could not be read");
            return;
        }
        System.out.println("Loading words by documents matrix");
        SparseMatrix docMatrix = new SparseMatrix(dim, col, row, nz);
        if(words.length != docMatrix.m) {
            System.err.println("Words file and doc file have inconsistant sizes");
            System.err.println("Words file has "+words.length+" words");
            System.err.println("Doc file uses "+docMatrix.m+" words");
            return;
        }
        // Every word is looked up in wordCluster when the tables are written.
        if(wordCluster.length < words.length) {
            System.err.println("Word cluster file and words file have inconsistent sizes");
            System.err.println("Word cluster file has "+wordCluster.length+" entries");
            System.err.println("Words file has "+words.length+" words");
            return;
        }
        // Each document needs its own matrix column.
        if(names.length > docMatrix.n) {
            System.err.println("Cluster file and doc file have inconsistent sizes");
            System.err.println("Cluster file has "+names.length+" documents");
            System.err.println("Doc file has "+docMatrix.n+" documents");
            return;
        }
        System.out.println("Creating cluster browser");
        Document docs[] = compileDocuments(names, docMatrix);
        int clusters[][] = compileClusters(clusterVector, numClusters);
        double clusterWordMeans[][] =
            compileClusterWordMeans(clusters, docs, words.length);
        sortClusters(clusters, docs);

        produceWordHTMLs(clusterWordMeans, words, baseName, wordCluster);
        produceHTML(baseName, clusters, names, words, docs, clusterWordMeans, descURL);
    }

/**
 * Writes one HTML table per cluster, baseName/baseName_words_i.html, listing
 * up to 50 words in the order given by createSortedIndex, each with its
 * cluster-mean weight and its word-cluster number.
 *
 * Failures to open or write a file are reported on System.err and the
 * remaining clusters are still processed.
 *
 * @param clusterWordMeans per-cluster word weights, indexed [cluster][word]
 * @param words            word for each word index
 * @param baseName         output directory and file-name prefix
 * @param wordCluster      word-cluster number for each word index
 */
void produceWordHTMLs(double clusterWordMeans[][], String words[], String baseName, int wordCluster[]) {
        NumberFormat nf = new DecimalFormat();
        for(int i=0; i < clusterWordMeans.length; i++) {
            int topWords[] = createSortedIndex(clusterWordMeans[i]);
            int wordCount = Math.min(topWords.length, 50);
            String fileName = baseName+"/"+baseName+"_words_"+i+".html";

            PrintWriter out = null;
            try {
                out = new PrintWriter(new FileWriter(fileName));
                out.println("<table>");
                for(int j=0; j < wordCount; j++) {
                    out.println("<tr>");
                    out.println("<td>");
                    out.println(words[topWords[j]]);
                    out.println("</td>");
                    out.println("<td>");
                    out.println(nf.format(clusterWordMeans[i][topWords[j]]));
                    out.println("</td>");
                    out.println("<td>");
                    out.println("W#");
                    out.println(wordCluster[topWords[j]]);
                    out.println("</td>");
                    out.println("</tr>");
                }
                out.println("</table>");
            }
            catch(IOException e) {
                System.err.println(e);
            }
            finally {
                // Close even if a runtime exception interrupts the table.
                if(out != null) {
                    // PrintWriter never throws on a failed write; checkError()
                    // is the only way to notice one.
                    if(out.checkError())
                        System.err.println("Error while writing "+fileName);
                    out.close();
                }
            }
        }
    }


    /**
     * Writes one HTML table per cluster, baseName/baseName_words_i.html,
     * listing up to 50 words in the order given by createSortedIndex, each
     * with its cluster-mean weight.
     *
     * Failures to open or write a file are reported on System.err and the
     * remaining clusters are still processed.
     *
     * @param clusterWordMeans per-cluster word weights, indexed [cluster][word]
     * @param words            word for each word index
     * @param baseName         output directory and file-name prefix
     */
    void produceWordHTMLs(double clusterWordMeans[][], String words[], String baseName) {
        NumberFormat nf = new DecimalFormat();
        for(int i=0; i < clusterWordMeans.length; i++) {
            int topWords[] = createSortedIndex(clusterWordMeans[i]);
            int wordCount = Math.min(topWords.length, 50);
            String fileName = baseName+"/"+baseName+"_words_"+i+".html";

            PrintWriter out = null;
            try {
                out = new PrintWriter(new FileWriter(fileName));
                out.println("<table>");
                for(int j=0; j < wordCount; j++) {
                    out.println("<tr>");
                    out.println("<td>");
                    out.println(words[topWords[j]]);
                    out.println("</td>");
                    out.println("<td>");
                    out.println(nf.format(clusterWordMeans[i][topWords[j]]));
                    out.println("</td>");
                    out.println("</tr>");
                }
                out.println("</table>");
            }
            catch(IOException e) {
                System.err.println(e);
            }
            finally {
                // Close even if a runtime exception interrupts the table.
                if(out != null) {
                    // PrintWriter never throws on a failed write; checkError()
                    // is the only way to notice one.
                    if(out.checkError())
                        System.err.println("Error while writing "+fileName);
                    out.close();
                }
            }
        }
    }

    /**
     * Writes the frame-based browser into the directory baseName:
     * the frameset page Browser_K.html, two placeholder pages (default.html,
     * default2.html), the cluster list CList_K.html and one page
     * C_K_Cluster&lt;i&gt;.html per cluster, where K is the number of clusters.
     *
     * I/O failures are printed to System.err; whichever file is open at that
     * point is still closed.
     *
     * @param baseName         output directory and prefix of the word pages
     * @param clusters         document indices of each cluster
     * @param names            document name for each document index
     * @param words            word for each word index
     * @param docs             documents; their dist value is shown next to the name
     * @param clusterWordMeans per-cluster word weights, used to pick the label word
     * @param urlbase          prefix of every document link
     */
    public void produceHTML(String baseName, int clusters[][],
                            String names[], String words[], Document docs[],
                            double clusterWordMeans[][],String urlbase) {
        NumberFormat nf = new DecimalFormat();
        String mainName = "Browser_"+clusters.length+".html";
        String clName = "CList_"+clusters.length+".html";
        String cName = "C_"+clusters.length;

        // Always refers to the file currently being written so the finally
        // block can close it when anything goes wrong part-way through.
        PrintWriter out = null;
        try {
            // Frameset: cluster list and cluster page on the left, document on the right.
            out = new PrintWriter(new FileWriter(baseName+"/"+mainName));
            out.println("<Title> Clustering </Title>");
            out.println("<FRAMESET cols=\"30%,70%\">");
            out.println("<FRAMESET rows=\"35%,65%\">");
            out.println("<FRAME src=\""+clName+"\" name=\"clusterListFrame\">");
            out.println("<FRAME src=\"default2.html\" name=\"clusterFrame\">");
            out.println("</FRAMESET>");
            out.println("<FRAME src=\"default.html\" name=\"documentFrame\">");
            out.println("</FRAMESET>");
            out.close();

            out = new PrintWriter(new FileWriter(baseName+"/default.html"));
            out.println("(Documents or words will be loaded in this frame.)");
            out.close();

            out = new PrintWriter(new FileWriter(baseName+"/default2.html"));
            out.println("(Clusters will be loaded here.)");
            out.close();

            // Cluster list: link to the cluster page, size, and top word
            // linking to the cluster's word table.
            out = new PrintWriter(new FileWriter(baseName+"/"+clName));
            for(int i=0; i < clusters.length; i++) {
                out.println("<a href=\""+cName+"_Cluster"+i+".html"+ "\" target=\"clusterFrame\">");
                out.println("C#"+i+" </a>");
                out.println("("+clusters[i].length+")");
                String topWords[] = getTopWords(words, clusterWordMeans[i], 1);
                out.println("<a href=\""+baseName+"_words_"+i+
                            ".html\" target=\"documentFrame\">");
                out.println(topWords[0]+"</a>");
                out.println("<br>");
            }
            out.close();

            // One page per cluster listing its documents.
            for(int i=0; i < clusters.length; i++) {
                out = new PrintWriter(new FileWriter(baseName+"/"+cName+"_Cluster"+i+".html"));
                out.println("<h3> Cluster #"+i+" </h3>");
                out.println("<br>");
                for(int j=0; j < clusters[i].length; j++) {
                    String filename = names[clusters[i][j]];
                    out.println(i+":<a href=\""+urlbase+filename+
                                "\" target=\"documentFrame\"> "+filename+
                                "("+nf.format(docs[clusters[i][j]].dist)+") </a>");
                    out.println("<br>");
                }
                out.close();
            }
        }
        catch(IOException e) {
            System.err.println(e);
        }
        finally {
            // Closing an already closed PrintWriter is harmless.
            if(out != null)
                out.close();
        }
    }

    /**
     * Builds the list of positions 0..V.length-1 and hands it to quickSort,
     * which reorders the positions by their value in V (largest first).
     * V itself is not modified.
     *
     * @param V values to rank
     * @return positions of V in sorted order
     */
    public int[] createSortedIndex(double V[]) {
        final int size = V.length;
        int order[] = new int[size];
        int k = 0;
        while(k < size) {
            order[k] = k;
            k++;
        }

        quickSort(order, V, 0, size-1);
        return order;
    }

    /**
     * Reorders the document indices inside every cluster, in place, by the
     * dist value of the documents (largest first).
     *
     * @param clusters document indices of each cluster; each row is reordered
     * @param docs     documents whose dist values are the sort keys
     */
    public void sortClusters(int clusters[][], Document docs[]) {
        // Snapshot the keys once; quickSort looks values up by document index.
        double distances[] = new double[docs.length];
        int d = 0;
        for(Document doc : docs)
            distances[d++] = doc.dist;

        for(int[] members : clusters)
            quickSort(members, distances, 0, members.length-1);
    }

    /**
     * Recursively sorts A[p..r] in place so that V[A[k]] is non-increasing.
     *
     * @param A indices into V; the slice p..r is reordered
     * @param V sort keys, never modified
     * @param p first position of the slice
     * @param r last position of the slice (inclusive)
     */
    private void quickSort(int A[], double V[], int p, int r) {
        // Slices of zero or one element are already sorted.
        if(p >= r)
            return;

        final int split = partition(A, V, p, r);
        quickSort(A, V, p, split);
        quickSort(A, V, split+1, r);
    }

	/**
	 * Splits A[p..r] around the key of its first element, using two cursors
	 * that move towards each other and swap misplaced entries.
	 *
	 * On return every entry in A[p..q] has a key greater than or equal to the
	 * pivot key and every entry in A[q+1..r] has a key less than or equal to
	 * it, where q is the returned position.
	 *
	 * @param A indices into V; the slice p..r is rearranged
	 * @param V sort keys, never modified
	 * @param p first position of the slice
	 * @param r last position of the slice (inclusive)
	 * @return last position of the left part
	 */
	private int partition(int A[], double V[], int p, int r) {
		// The pivot key is fixed up front; A[p] itself may be swapped away.
		final double pivotValue = V[A[p]];
		int left = p - 1;
		int right = r + 1;

		for(;;) {
			// Move the right cursor down to an entry that belongs on the left.
			right--;
			while(V[A[right]] < pivotValue)
				right--;

			// Move the left cursor up to an entry that belongs on the right.
			left++;
			while(V[A[left]] > pivotValue)
				left++;

			if(left >= right)
				return right;

			int swapped = A[left];
			A[left] = A[right];
			A[right] = swapped;
		}
	}


    /**
     * Sums the word weights of the documents of each cluster, scales each sum
     * to unit Euclidean length, and stores in every document's dist field the
     * dot product of the document's weights with its cluster's scaled vector.
     *
     * A cluster whose summed vector has zero length (for example an empty
     * cluster) is left as all zeros instead of being divided by zero.
     *
     * @param clusters  document indices of each cluster
     * @param docs      documents; their dist field is overwritten
     * @param wordCount number of distinct words (length of each result row)
     * @return per-cluster word vectors, indexed [cluster][word]
     */
    public double[][] compileClusterWordMeans(int clusters[][], Document docs[],
                                          int wordCount) {
        double clusterWordMeans[][] = new double[clusters.length][wordCount];

        for(int i=0; i < clusters.length; i++) {
            // Accumulate the weights of all member documents.
            for(int j=0; j < clusters[i].length; j++) {
                Document doc = docs[clusters[i][j]];
                for(int k=0; k < doc.words.length; k++)
                    clusterWordMeans[i][doc.words[k]] += doc.wordCounts[k];
            }

            // Normalize cluster mean
            double clusterLength = 0.0;
            for(int j=0; j < wordCount; j++)
                clusterLength += clusterWordMeans[i][j]*clusterWordMeans[i][j];
            clusterLength = Math.sqrt(clusterLength);

            // Dividing by a zero length would fill the row with NaN.
            if(clusterLength > 0.0) {
                for(int j=0; j < wordCount; j++)
                    clusterWordMeans[i][j] /= clusterLength;
            }

            // Compute distance to cluster center
            for(int j=0; j < clusters[i].length; j++) {
                Document doc = docs[clusters[i][j]];
                doc.dist = 0.0;
                for(int k=0; k < doc.words.length; k++)
                    doc.dist += clusterWordMeans[i][doc.words[k]]*doc.wordCounts[k];
            }
        }

        return clusterWordMeans;
    }

    /**
     * Groups document indices by cluster number.
     *
     * @param clusterVector cluster number of each document, by document index
     * @param numClust      number of clusters; every cluster number must be
     *                      in the range 0..numClust-1
     * @return for each cluster, the indices of its documents in ascending order
     */
    public int[][] compileClusters(int clusterVector[], int numClust) {
        int members[][] = new int[numClust][];

        // First pass: how many documents does each cluster hold?
        int sizes[] = new int[numClust];
        for(int label : clusterVector)
            sizes[label]++;

        for(int c=0; c < numClust; c++)
            members[c] = new int[sizes[c]];

        // Second pass: drop every document into the next free slot of its cluster.
        int cursor[] = new int[numClust];
        for(int doc=0; doc < clusterVector.length; doc++) {
            int label = clusterVector[doc];
            members[label][cursor[label]] = doc;
            cursor[label]++;
        }

        return members;
    }

    /**
     * Builds one Document per name from the matching column of the matrix
     * and scales each document's weights to unit Euclidean length.
     *
     * A document whose weights are all zero keeps them unchanged instead of
     * being divided by zero.
     *
     * @param names     document names; names[i] belongs to matrix column i
     * @param docMatrix matrix with at least names.length columns
     * @return the documents, in the order of names
     */
    public Document[] compileDocuments(String names[], SparseMatrix docMatrix) {
        Document docs[] = new Document[names.length];

        for(int i=0; i < docs.length; i++) {
            int first = docMatrix.colPtrs[i];
            int wordCount = docMatrix.colPtrs[i+1] - first;
            int words[] = new int[wordCount];
            float counts[] = new float[wordCount];
            for(int j=0; j < wordCount; j++) {
                words[j] = docMatrix.rowIndices[first+j];
                counts[j] = docMatrix.val[first+j];
            }
            // The Document keeps a reference to counts, so the scaling below
            // is visible through it.
            docs[i] = new Document(names[i], words, counts);

            double length = 0.0;
            for(int j=0; j < counts.length; j++)
                length += counts[j]*counts[j];
            length = Math.sqrt(length);

            // Dividing by a zero length would turn every weight into NaN.
            if(length > 0.0) {
                for(int j=0; j < wordCount; j++)
                    counts[j] /= length;
            }
        }

        return docs;
    }

    /**
     * Reads the word list: a leading word count followed by that many
     * whitespace-separated words.
     *
     * Words are taken verbatim, so entries that look like numbers ("2000"),
     * or contain "/" or quote characters, are kept intact. If the input ends
     * before all words were read, a message is printed to System.err and the
     * remaining entries stay null. An unreadable count is reported and
     * treated as zero.
     *
     * @param in source of the word list
     * @return the words, or null if reading failed with an I/O error
     */
    public String[] loadWords(Reader in) {
        StreamTokenizer tok = new StreamTokenizer(in);
        // Split on whitespace only. The default syntax parses numbers, treats
        // "/" as a comment start and quotes as string delimiters, all of
        // which corrupt ordinary vocabulary entries.
        tok.resetSyntax();
        tok.whitespaceChars(0, ' ');
        tok.wordChars('!', 255);

        String words[];
        try {
            int count = 0;
            if(tok.nextToken() == StreamTokenizer.TT_WORD) {
                try {
                    count = (int)Double.parseDouble(tok.sval);
                }
                catch(NumberFormatException e) {
                    System.err.println("Ill formed matrix file: load words");
                }
            }
            words = new String[count];
            for(int i=0; i < count; i++) {
                if(tok.nextToken() != StreamTokenizer.TT_WORD) {
                    System.err.println("Ill formed matrix file: load words");
                    break;
                }
                words[i] = tok.sval;
            }
        }
        catch(IOException e){
            System.err.println("Ill formed matrix file: load words");
            words = null;
        }

        return words;
    }

    /**
     * Reads n lines of the form "key: name" and returns the names.
     *
     * The name is everything after the first ":" with surrounding whitespace
     * removed, so names that themselves contain ":" (such as URLs) are kept
     * whole. The key is not used. A line without ":" is reported on
     * System.err and its entry stays null; if the input ends early the
     * problem is reported and the remaining entries stay null.
     *
     * @param in source of the name lines
     * @param n  number of lines to read
     * @return array of n names
     */
    public String[] loadNames(Reader in, int n) {
        String docs[] = new String[n];
        BufferedReader buffin = new BufferedReader(in);

        try {
            for(int i=0; i < n; i++) {
                String line = buffin.readLine();
                if(line == null) {
                    System.err.println("Bad Name File: expected "+n+
                                       " names, found "+i);
                    break;
                }
                int colon = line.indexOf(':');
                if(colon < 0) {
                    System.err.println("Bad Name File: no ':' in line "+(i+1));
                    continue;
                }
                docs[i] = line.substring(colon+1).trim();
            }
        }
        catch(IOException e){
            System.err.println("Bad Name File");
        }
        return docs;
    }

    /**
     * Reads the cluster assignment of every document.
     *
     * The first line starts with the number of documents; each following line
     * starts with the cluster number of one document. Anything after that
     * number on the same line (such as the file name) is skipped.
     *
     * A line that does not start with an integer is reported on System.err
     * and that document keeps cluster number 0. The last line does not need a
     * trailing newline.
     *
     * @param in source of the cluster file
     * @return cluster number per document, or null if the header is not an
     *         integer or the input ends before all documents were read
     */
    public int[] loadClusterVector( Reader in) {

        int[] vector;
        Scanner inFile = new Scanner(in);

        try {
            int row = inFile.nextInt();
            // Skip the rest of the header line, if there is one.
            if(inFile.hasNextLine())
                inFile.nextLine();

            vector = new int[row];
            for(int i=0; i < row; i++) {
                try {
                    vector[i] = inFile.nextInt();
                }
                catch(InputMismatchException e) {
                    System.err.println("loadClusterVector: expected a number");
                }

                // Skip the file name. nextLine() throws when nothing at all is
                // left, which happens on a final line without a newline.
                if(inFile.hasNextLine())
                    inFile.nextLine();
            }
        }
        catch(NoSuchElementException e) {
            // Also covers InputMismatchException (a subclass) from a bad header,
            // as well as input that ends before all entries were read.
            System.err.println("Ill formed matrix file: load cluster vector");
            vector = null;
        }

        return vector;
    }

    /**
     * Reads a dense matrix: row count, column count, then the values one
     * column after another.
     *
     * @param in source of the numbers
     * @return the matrix indexed [row][column], or null if reading failed
     */
    public double[][] loadMatrix(Reader in) {
        StreamTokenizer tokens = new StreamTokenizer(in);
        try {
            tokens.nextToken();
            final int rows = (int) tokens.nval;
            tokens.nextToken();
            final int cols = (int) tokens.nval;

            double[][] result = new double[rows][cols];
            // Values arrive column by column, so the row index cycles fastest.
            int c = 0;
            while(c < cols) {
                int r = 0;
                while(r < rows) {
                    tokens.nextToken();
                    result[r][c] = tokens.nval;
                    r++;
                }
                c++;
            }
            return result;
        }
        catch(IOException e) {
            System.err.println("Ill formed matrix file");
            return null;
        }
    }
}
