/*
 * Copyright (c) 2010 Gerhard Beck.  All rights reserved.
 *
 * Subject to the GNU GENERAL PUBLIC LICENSE,
 * Version 3, 29 June 2007 http://www.gnu.org/licenses/gpl.html
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL GERHARD BECK OR
 * OTHER CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

package org.gerhardb.jibs.textPad.ranker;

import java.io.*;
import java.util.*;

import javax.swing.JProgressBar;

/**
 * Ranks a list of files based on the counts of various words in the file.
 * 
 * Designed to work with files produced by WordCount.  WordCount produces
 * files in the following format:
 * FullPathToFile,sub,submarine,ship
 * D:\dev\test\article_1.txt,1,0,1
 * D:\dev\test\article_2.txt,4,0,0
 * 
 * You need to provide a file with the scoring for words you care about.
 * The scoring file may contain different, more, or fewer words than those counted.
 * If none of the scoring file words are in the original WordCount list, all 
 * files will have a rank of zero.
 * 
 * The scoring file has the following format:
 * # Lines starting with # are comments and will be ignored.
 * # Format for a line is:
 * label: word_a,word_b: 20->400,0->200
 * 
 * # White space is ignored so these two lines are equivalent:
 * label:word_a,word_b:20->400,0->200
 * label: word_a, word_b: 20->400, 0->200  
 * 
 * # 20->400 means that if more than 20 occurrences of the words are found, 400 is added to the score.
 * # 0->200 means that if any occurrences of the words are found, 200 is added to the score.
 * 
 * # labels may not start with "jibs_"
 * a: sub,submarine; 0>300,5>500,10>700,20>1000
 * 
 * # The label is optional.  If the label is not used, just start with the words as in:
 * navy;0>13,10>18
 * # 
 * # You can also score on groups of previously scored words or groups.  
 * # The format for using groups is:
 * group_label: (label_a,label_b); 0>300
 * 
 * # Labels are also optional for groups:
 * (a,f);0>666
 * 
 * @author Gerhard Beck
 */
public class RankByWordCounts
{
	/** Set to true to make {@link #logln(String)} print every message, not only forced ones. */
	private static final boolean DEBUG = false;

	/** Number of ranked rows written to each HTML page. */
	private static final int ROWS_PER_HTML_PAGE = 50;

	// ==========================================================================
	// Items which are initialized when reading in file and are not modified
	// subsequently.
	// ==========================================================================
	
	/**
	 * List of files to be ranked, read in from the WordCount file.
	 * Order is the order they are found in that file.
	 * Index must be tracked to correctly get the file name after the scorings
	 * are ordered from high to low. 
	 * 
	 * The first item in myFileList is the label of the file column taken from
	 * the header row of the WordCount file, so real file names start at index 1.
	 */
	ArrayList<String> myFileList;	
	
	/**
	 * List of the words which were counted by WordCount in the order they appear
	 * in the WordCount file read in.
	 * 
	 * The first item in myTokenList is the label of the file column, so this
	 * array can be used as is, starting from zero, to print out the header row.
	 */
	String[] myTokenList = null;
	
	/**
	 * Set by {@link #stop()} and polled by the long running loops.
	 * Volatile because stop() is presumably called from a different thread
	 * than the one running rank() -- confirm against the caller.
	 */
	volatile boolean iStop;
	JProgressBar myProgress; 
	String myRootDirectoryScanned;
	
	/**
	 * @param progress progress bar updated while ranking; must not be null
	 * because {@link #rank} updates it unconditionally.
	 */
	RankByWordCounts(JProgressBar progress)
	{
		this.myProgress = progress;
	}
	
	/**
	 * Requests that a running {@link #rank} stop at its next check point.
	 */
	void stop()
	{
		this.iStop = true;
	}

	/**
	 * Reads the word counts, scores every row, sorts the rows by score and
	 * writes the result as CSV and, optionally, as paged HTML.
	 * 
	 * @param wordCountFileName File produced by WordCount.
	 * @param scoringFileName File with scoring for each word.
	 * @param outFileName File where the ranked results will go.  
	 * Overwrites whatever file of this name is already there.
	 * @param html true to also write HTML pages named outFileName_N.html.
	 * @throws Exception if the output name equals one of the input names, if
	 * an input file does not exist, or if reading, scoring or writing fails.
	 */
	void rank(String wordCountFileName, String scoringFileName, String outFileName, boolean html) throws Exception
	{
		this.myProgress.setString("");
		this.myProgress.setValue(0);
		
		logln("wordCountFileName: " + wordCountFileName, false);
		logln("scoringFileName: " + scoringFileName, false);
		logln("outFileName: " + outFileName, false);
		
		this.iStop = false;
		
		// Writing the output over either input would destroy it.
		if (wordCountFileName.equals(outFileName) || scoringFileName.equals(outFileName))
		{
			throw new Exception("File names must be different.");
		}

		File scoringFile = new File(scoringFileName);
		File wordCountFile = new File(wordCountFileName);
		File outFile = new File(outFileName);		
		
		if (!scoringFile.exists())
		{
			throw new Exception("Scoring File must exist: " + scoringFile);
		}
			
		if (!wordCountFile.exists())
		{
			throw new Exception("Word Count File must exist: " + wordCountFile);
		}
			
		// Do the actual work!! --------------------------------------------------
		
		// Read in the Scoring File to get the scoring factors. 
		// These will be used to score each row.
		ScoreARow rowScorer = new ScoreARow(this, scoringFile);
		
		// Read in counts from file.
		// Returns wordCounts[row][col]
		// First column is index, last column is reserved for the score.
		// Also sets myFileList and myTokenList.
		int[][] wordCounts = readWordCounts(wordCountFile);	
		
		// A stop request during reading leaves nothing usable to write.
		if (this.iStop){return;}
		
		// Based on: wordCounts[row][col]
		IndexedScore[] ranking = new IndexedScore[wordCounts.length];
		int rowTotal = ranking.length;
		this.myProgress.setMaximum((int)(rowTotal * 1.1));
		for(int row=0; row<rowTotal; row++)
		{
			if (this.iStop){return;}
			// NOTE(review): myFileList slot 0 holds the header label, so get(row)
			// may be one off from the file this row describes (row + 1) -- confirm
			// against what ScoreARow.score expects before changing.
			ranking[row] = rowScorer.score(row, wordCounts[row], this.myFileList.get(row));
			this.myProgress.setValue(row);
		}
				
		// Sort the rankings
		Arrays.sort(ranking);		
		this.myProgress.setValue(rowTotal + (int)(rowTotal * 0.02));
		
		// Write back to disk.
		writeCVS(outFile, wordCounts, ranking);
		this.myProgress.setValue(rowTotal + (int)(rowTotal * 0.06));

		if (html)
		{
			writeHTML(wordCountFile, outFileName, wordCounts, ranking);
		}
		this.myProgress.setValue(rowTotal + (int)(rowTotal * 0.10));
		
		logln("");
		logln("---------------------------------------------------");
		logln("Done", true);				

		this.myProgress.setValue(this.myProgress.getMaximum());
		this.myProgress.setString("Done");
	}
	
	// ==========================================================================
	// Reading
	// ==========================================================================

	/**
	 * Reads the WordCount file and fills myRootDirectoryScanned, myFileList and
	 * myTokenList as a side effect.
	 * 
	 * Returned list is in [row][col] format.
	 * Row 0 is the first row with real data after the labels.
	 * Col 0 is the index of the row's file name in myFileList (starting at 1).
	 * A column is added at the end for the score to go into.
	 * 
	 * @param inFile file produced by WordCount
	 * @return the counts, or an empty array if a stop was requested while reading
	 * @throws Exception if the file cannot be read, is empty, has no header
	 * row, or contains a count which is not a number
	 */
	int[][] readWordCounts(File inFile) throws Exception
	{
		logln("\n---------------------------------------------------");
		logln("Reading in count file: " + inFile);
		
		this.myFileList = new ArrayList<String>(200);	
		
		try
		{
			// Read the file once so both passes below see the same content.
			List<String> lines = readLines(inFile);
			if (lines.isEmpty())
			{
				throw new Exception("Word Count File is empty: " + inFile);
			}
			
			// First line has the directory scanned which we need to put in output file.
			String firstLine = lines.get(0);
			if (firstLine.length() < WordCount.TREE_BASE.length())
			{
				throw new Exception("Word Count File does not start with the scanned directory line: " + inFile);
			}
			this.myRootDirectoryScanned = firstLine.substring(WordCount.TREE_BASE.length());
			logln("RootDirectoryScanned: " + this.myRootDirectoryScanned);
			
			if (lines.size() < 2)
			{
				throw new Exception("Word Count File has no header row: " + inFile);
			}
			
			// First pass gets the file names, and the tokens from the header row.
			for(int lineNo=1; lineNo<lines.size(); lineNo++)
			{
				String aLine = lines.get(lineNo);
				logln(aLine);
				if (this.iStop){return new int[0][0];}
				String[] nameAndRest = aLine.split(WordCount.FILE_TOKEN);
				// split() returns an empty array for a line holding only separators.
				this.myFileList.add(nameAndRest.length > 0 ? nameAndRest[0].trim() : "");
				if (lineNo == 1)
				{
					this.myTokenList = parseHeader(nameAndRest);
				}
			}

			// Columns are:
			// 0: index to look up file name
			// 1 to number of counted words: counts from the original file
			// last: the score
			// The header is not a data row, so there is one row less than file names.
			int[][] counts = new int[this.myFileList.size() - 1][this.myTokenList.length + 1];
			
			// Second pass fills the array.  Data starts on the third line.
			for(int row=0; row<counts.length; row++)
			{
				// Index starts from 1 because zero slot in file name array is headers. 
				counts[row][0] = row + 1;
				fillCounts(counts[row], lines.get(row + 2));
			}
			return counts;
		}
		catch(Exception ex)
		{
			throw wrapFailure(ex);
		}
	}
	
	/**
	 * Reads every line of the file into memory and always closes the reader.
	 */
	private static List<String> readLines(File inFile) throws IOException
	{
		List<String> lines = new ArrayList<String>(200);
		BufferedReader reader = new BufferedReader(new FileReader(inFile));
		try
		{
			for(String line = reader.readLine(); line != null; line = reader.readLine())
			{
				lines.add(line);
			}
		}
		finally
		{
			reader.close();
		}
		return lines;
	}
	
	/**
	 * Builds the token list from the header row already split on WordCount.FILE_TOKEN:
	 * slot 0 is the file column label, followed by the trimmed word headings.
	 */
	private static String[] parseHeader(String[] headerParts)
	{
		String[] headings = headerParts.length > 1 ? headerParts[1].split(",") : new String[0];
		
		// Size is one bigger for the split off file name.
		String[] tokens = new String[headings.length + 1];
		tokens[0] = headerParts.length > 0 ? headerParts[0].trim() : "";
		for(int i=0; i<headings.length; i++)
		{
			tokens[i+1] = headings[i].trim();
		}
		return tokens;
	}
	
	/**
	 * Parses the comma separated counts of one data line into rowCounts,
	 * starting at column 1.  Empty counts are stored as zero.
	 */
	private static void fillCounts(int[] rowCounts, String aLine) throws Exception
	{
		String[] nameAndCounts = aLine.split(WordCount.FILE_TOKEN);				
		if (nameAndCounts.length <= 1)
		{
			return;
		}
		logln("splitFileNameFromCounts[1]: " + nameAndCounts[1]);
		String[] tokens = nameAndCounts[1].split(",");
		for(int i=0; i<tokens.length; i++)
		{
			// One off for the index column.
			int col = i + 1;
			if (col >= rowCounts.length)
			{
				throw new Exception("Row has more counts than the header has words: " + aLine);
			}
			int aValue = 0;
			if (tokens[i] != null && tokens[i].length() != 0)
			{							
				try
				{
					aValue = Integer.parseInt(tokens[i].trim());
				}
				catch(NumberFormatException ex)
				{
					throw new Exception("Cound not get count from: " + tokens[i].trim() + ": " + ex.getMessage(), ex); 
				}
			}
			rowCounts[col] = aValue;
		}
	}
	
	/**
	 * Converts any failure into a plain Exception which always has a message
	 * and keeps the original as its cause.  Exceptions which already are plain
	 * Exceptions are passed through unchanged.
	 */
	private static Exception wrapFailure(Exception ex)
	{
		if (ex.getClass() == Exception.class)
		{
			return ex;
		}
		String msg = ex.getMessage();
		if (msg == null || msg.length() == 0)
		{
			// For example a NullPointerException: never lose the failure.
			msg = ex.toString();
		}
		return new Exception(msg, ex);
	}
	
	// ==========================================================================
	// Writers
	// ==========================================================================
	
	/**
	 * Writes the whole ranking as a series of HTML pages named
	 * outFileName_1.html, outFileName_2.html and so on.
	 * Nothing is written if the ranking is empty.
	 * 
	 * @param inFile WordCount file, used for the page title
	 * @param outFileName base name of the pages
	 * @param wordCounts counts in [row][col] format as returned by readWordCounts
	 * @param ranking rows sorted into the order they are to be shown
	 * @throws Exception if a page cannot be written
	 */
	void writeHTML(File inFile, String outFileName, int[][] wordCounts, IndexedScore[]ranking)
	throws Exception
	{
		int page = 1;
		for(int start=0; start<ranking.length; start+=ROWS_PER_HTML_PAGE)
		{
			writeHTML(inFile, outFileName, wordCounts, ranking, start, start + ROWS_PER_HTML_PAGE, page);
			page++;
		}
	}

	/**
	 * Writes one HTML page with a table holding ranking[start] up to, but not
	 * including, ranking[stop].  The page gets links to the previous page, if
	 * page is greater than one, and to the next page, if there are rows left.
	 * 
	 * @param inFile WordCount file, used for the page title
	 * @param outFileName base name; the page is written to outFileName_page.html
	 * @param wordCounts counts in [row][col] format as returned by readWordCounts
	 * @param ranking rows sorted into the order they are to be shown
	 * @param start index into ranking of the first row on this page
	 * @param stop index into ranking after the last row; clipped to ranking.length
	 * @param page number of this page, starting at 1
	 * @throws Exception if the page cannot be written
	 */
	void writeHTML(File inFile, String outFileName, int[][] wordCounts, IndexedScore[]ranking, int start, int stop, int page) 
	throws Exception
	{
		File outHTML = new File(outFileName + "_" + page + ".html");		
		int last = Math.min(stop, ranking.length);
		
		// Zero means there is no next page to link to.
		int nextPage = (last < ranking.length) ? page + 1 : 0;
		
		PrintWriter outWriter = null;
		try
		{
			outWriter = new PrintWriter(
					new BufferedWriter( new FileWriter(outHTML)));
			
			outWriter.println("");
			outWriter.println("");
			outWriter.println("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
			outWriter.println("<html>");
			outWriter.println("<head>");
			outWriter.println("<title>" + escapeHtml(inFile.toString()) + "</title>");
			outWriter.println("</head>");
			outWriter.println("<body>");	
			
			// Top page scrollers
			page(outWriter, outFileName, page, nextPage);
			
			outWriter.println("<table border=\"1\">");

			outWriter.println("<tr>");
			outWriter.println("<th>#</th>");
			outWriter.println("<th>" + escapeHtml(this.myTokenList[0]) + "</th>");
			outWriter.println("<th>Index</th>");
			outWriter.println("<th>");
			outWriter.println("Score</th>");
			for(int i=1; i<this.myTokenList.length; i++)
			{
				outWriter.println("<th>" + escapeHtml(this.myTokenList[i]) + "</th>");
			}
			outWriter.println("</tr>");
			
			for(int rank=start; rank<last; rank++)
			{
				// The index looks up the file name; the counts are one row lower
				// because the file list has the header in slot zero.
				int fileIndex = ranking[rank].myIndex;
				String fileName = this.myFileList.get(fileIndex);
				int[] counts = wordCounts[fileIndex - 1];
				int scoreColumn = counts.length - 1;
				
				outWriter.println("<tr>");
				outWriter.println("<td>" + (rank + 1) + "</td>");
				
				// Replace % with %25 so html will read %20 as a %20 instead of space.
				// Because many files had %20 by html space convention.
				outWriter.println("<td>");
				outWriter.print("<a href=\"file:///"); 
				outWriter.print(escapeHtml(fileName.replace("%", "%25"))); 
				outWriter.print("\" target=\"_blank\" >");
				outWriter.print(escapeHtml(fileName));
				outWriter.print("</a>");
				outWriter.println("</td>");

				// Print index
				outWriter.println("<td>");
				outWriter.println(counts[0] + "</td>");

				// Print score from last position in array
				outWriter.println("<td>");
				outWriter.println(counts[scoreColumn] + "</td>");
				
				// Print actual counts		
				for(int col=1; col<scoreColumn; col++)
				{
					outWriter.println("<td>");
					outWriter.println(counts[col] + "</td>");
				}
				
				outWriter.println("</tr>");
			}			
			
			outWriter.println("</table>");
			
			outWriter.println("Page: " + page);
			
			// Bottom page scrollers
			page(outWriter, outFileName, page, nextPage);
			
			outWriter.println("</body>");	
			outWriter.println("</html>");
			
			// PrintWriter never throws on write errors; ask it explicitly.
			if (outWriter.checkError())
			{
				throw new IOException("Error writing: " + outHTML);
			}
		}
		catch(Exception ex)
		{
			throw wrapFailure(ex);
		}
		finally
		{
			if (outWriter != null)
			{
				outWriter.close();
			}
		}
	}
	
	/**
	 * Writes the page number and the Previous/Next links of one HTML page.
	 * 
	 * @param outWriter where to write
	 * @param outFileName base name of the pages the links point to
	 * @param page number of the current page; a Previous link is written if it is greater than 1
	 * @param nextPage number of the next page, or zero (or less) for no Next link
	 */
	void page(PrintWriter outWriter, String outFileName, int page, int nextPage)
	{
		outWriter.println("Page: " + page + " &nbsp; &nbsp; &nbsp; &nbsp;");
		
		if (page > 1)
		{
			int priorPage = page - 1;
			outWriter.print("<a href=\"file:///"); 
			outWriter.print(escapeHtml(outFileName + "_" + priorPage + ".html")); 
			outWriter.print("\" >Previous</a>");
		}

		outWriter.print(" &nbsp; &nbsp; &nbsp; &nbsp; ");
		
		if (nextPage > 0)
		{
			outWriter.print("<a href=\"file:///"); 
			outWriter.print(escapeHtml(outFileName + "_" + nextPage + ".html")); 
			outWriter.print("\" >Next</a>");
		}		
	}
	
	/**
	 * Escapes the characters which would otherwise end an attribute value or
	 * be read as markup, so file names and words show up as typed.
	 */
	private static String escapeHtml(String text)
	{
		if (text == null)
		{
			return "";
		}
		StringBuilder escaped = new StringBuilder(text.length() + 16);
		for(int i=0; i<text.length(); i++)
		{
			char c = text.charAt(i);
			switch (c)
			{
				case '&': escaped.append("&amp;"); break;
				case '<': escaped.append("&lt;"); break;
				case '>': escaped.append("&gt;"); break;
				case '"': escaped.append("&quot;"); break;
				default: escaped.append(c);
			}
		}
		return escaped.toString();
	}
	
	/**
	 * Writes the ranked results as a comma separated file: the tree base line,
	 * a header row, then one row per ranked file holding the file name, its
	 * index, its score and its counts.
	 * 
	 * @param outFile file to write; overwritten if it exists
	 * @param wordCounts counts in [row][col] format as returned by readWordCounts
	 * @param ranking rows sorted into the order they are to be written
	 * @throws Exception if the file cannot be written
	 */
	void writeCVS(File outFile, int[][] wordCounts, IndexedScore[]ranking) throws Exception
	{
		PrintWriter outWriter = null;
		try
		{
			outWriter = new PrintWriter(
					new BufferedWriter( new FileWriter(outFile)));
			
			// First line is the directory of the tree base.
			outWriter.print(WordCount.TREE_BASE);
			outWriter.println(this.myRootDirectoryScanned);
			
			// Back to work on second line.
			outWriter.print(this.myTokenList[0]);
			outWriter.print(",Index,Score");
			for(int i=1; i<this.myTokenList.length; i++)
			{
				outWriter.print(',');
				outWriter.print(this.myTokenList[i]);
			}
			outWriter.println("");
			
			for(int rank=0; rank<ranking.length; rank++)
			{
				// The index looks up the file name; the counts are one row lower
				// because the file list has the header in slot zero.
				int fileIndex = ranking[rank].myIndex;
				outWriter.print(this.myFileList.get(fileIndex)); 
				int[] counts = wordCounts[fileIndex - 1];
				int scoreColumn = counts.length - 1;
				
				// Print index
				outWriter.print(',');
				outWriter.print(counts[0]);

				// Print score from last position in array
				outWriter.print(',');
				outWriter.print(counts[scoreColumn]);
				
				// Print actual counts		
				for(int col=1; col<scoreColumn; col++)
				{
					outWriter.print(',');
					outWriter.print(counts[col]);
				}
				outWriter.println("");
			}
			
			// PrintWriter never throws on write errors; ask it explicitly.
			if (outWriter.checkError())
			{
				throw new IOException("Error writing: " + outFile);
			}
		}
		catch(Exception ex)
		{
			throw wrapFailure(ex);
		}
		finally
		{
			if (outWriter != null)
			{
				outWriter.close();
			}
		}
	}
	
	/**
	 * Prints the message only when DEBUG is switched on.
	 */
	static void logln(String logMe)
	{
		logln(logMe, false);
	}
	
	/**
	 * Prints the message to standard out if DEBUG is switched on or force is true.
	 */
	static void logln(String logMe, boolean force)
	{
		if (DEBUG || force)
		{
			System.out.println(logMe);
		}
	}

	
	// ==========================================================================
	// Main
	// ==========================================================================

	/**
	 * Ranks JIBS_Word_Count.csv using scoring.txt, writing
	 * JIBS_RANKED_Word_Count.csv and its HTML pages, all in one directory.
	 * 
	 * @param args optional; args[0] is the directory to work in,
	 * which defaults to D:/dev/test
	 */
	public static void main(String[] args) throws Exception
	{
		String path = (args != null && args.length > 0) ? args[0] : "D:/dev/test";
		RankByWordCounts ranker = new RankByWordCounts(new JProgressBar());
		ranker.rank(
				path + "/JIBS_Word_Count.csv",
				path + "/scoring.txt",
				path + "/JIBS_RANKED_Word_Count.csv",
				true);
	}

}
