package com.wpollock.searchengine;
import java.awt.FileDialog;
import java.io.*;
import java.util.*;
import javax.swing.JOptionPane;
import javax.swing.JTextPane;
import static com.wpollock.searchengine.Main.*;
/**
 * To keep each file small and focused, I put the file read/write code
 * and the searching code in this class instead of in Main.
*
* @author wpollock
*/
class IndexUtils {
// The three search methods display the files, one per line, that
// match the search terms. The resulting JTextPane is selectable, but
// not editable. That allows users to select and copy filenames.
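    /** The inverted index: maps each (lowercased) word to the sorted set of
     * DocPos (document ID, word position) pairs at which that word occurs.
     * For example, if "apple" is word 0 of document 1 and word 5 of
     * document 2, its entry is {@code "apple" -> [(1,0), (2,5)]}.
     */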
static Map<String, SortedSet<DocPos>> invertedIndex = new HashMap<>();
/** Does an AND search. The idea is to start with a set of all document
* IDs in the index. Then for each term, remove any documents that don't
* match. The resulting final set is the documents matching all search
 * terms. (If the user enters no words, e.g. " ,. ", then all documents
 * are considered matching.)
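 * <p>Illustrative example (hypothetical words and document IDs): suppose
 * the index holds documents {1, 2, 3}, with "apple" occurring in {1, 2}
 * and "pie" in {2, 3}. Then:
 * <pre>{@code
 * // files starts as every doc ID in the index:  {1, 2, 3}
 * // after intersecting with docs for "apple":   {1, 2}
 * // after intersecting with docs for "pie":     {2}
 * doAndSearch("apple pie", resultsPane);  // lists only document 2's file
 * }</pre>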
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doAndSearch( String searchTerms, JTextPane resultsPane ) {
        // Copy all doc IDs, so retainAll()/clear() below can't modify the model's set:
        Set<Long> files = new TreeSet<>( FileListModel.getModel().getAllDocIDs() );
// split into words on any non-letter, non-digit, and non-dash:
String[] words = searchTerms.split( "[^a-zA-Z0-9-]+" );
// (Note, if user input contains no words, all docs match!)
        for ( String word : words ) {
            if ( word.isEmpty() )
                continue;  // Skip the empty token a leading delimiter produces.
            if ( files.isEmpty() )
                break;     // No files left to examine.
            Set<DocPos> positions = invertedIndex.get( word.toLowerCase() );
if ( positions == null ) {
files.clear(); // This word is not in index, so the
break; // AND search produced no matching files.
}
// Build the set of docIDs that contain the search term:
Set<Long> filesContainingWord = new TreeSet<>();
for ( DocPos position : positions )
filesContainingWord.add( position.docID );
// Remove any docIDs from files that don't contain the search term:
            files.retainAll( filesContainingWord ); // Set intersection.
}
// Convert the final set of matching docIDs to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( files.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
for ( Long docID : files )
results.append( "\n" + FileListModel.getModel().getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Does an OR search.
 * The general idea is to start with an empty set of matching files, and
 * add every document that contains any of the search terms found in the
 * index. (If the user enters no words, e.g. " ,. ", then no documents
 * are considered matching.)
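 * <p>Illustrative example (hypothetical words and document IDs): if the
 * index shows "apple" in documents {1, 2} and "pie" in documents {2, 3},
 * then
 * <pre>{@code
 * doOrSearch("apple pie", resultsPane);  // lists the files for documents 1, 2, and 3
 * }</pre>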
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doOrSearch( String searchTerms, JTextPane resultsPane ) {
Set<Long> files = new TreeSet<>(); // Set of matching docIDs
// split into words on any non-letter, non-digit, and non-dash:
String[] words = searchTerms.split( "[^a-zA-Z0-9-]+" );
for ( String word : words ) {
Set<DocPos> positions = invertedIndex.get( word.toLowerCase() );
if ( positions == null ) {
continue; // This word is not in index.
}
for ( DocPos position : positions ) {
files.add( position.docID ); // Add the file ID to a set
}
}
// Convert the final set of matching docIDs to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( files.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
for ( Long docID : files )
results.append( "\n" + FileListModel.getModel().getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Does a PHRASE search.
* The general algorithm is: build a set of DocPos objects for the
* documents that contain the first word. Then for each remaining
* word in the phrase, check each DocPos in the set to see if the word
* is in the same document, in the next position. If not, remove that
* DocPos from the set. Repeat until set is empty (so phrase not found),
* or until the last word of the phrase has been processed.
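 * <p>Illustrative example (hypothetical words, document IDs, and word
 * positions): if "quick" occurs at (docID=7, pos=3) and "brown" occurs at
 * (docID=7, pos=4), then
 * <pre>{@code
 * doPhraseSearch("quick brown", resultsPane);  // lists document 7's file
 * }</pre>
 * If "brown" instead occurred only at, say, (docID=7, pos=9), the phrase
 * would not match.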
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doPhraseSearch( String searchTerms, JTextPane resultsPane ) {
        Set<DocPos> filesContainingPhraseSoFar = new TreeSet<>();
        // Split into words on any non-letter, non-digit, non-dash character.
        // (First strip any leading delimiters, so split() doesn't produce an
        // empty first token for input such as ", quick brown".)
        String[] words = searchTerms.toLowerCase()
                .replaceFirst( "^[^a-z0-9-]+", "" )
                .split( "[^a-z0-9-]+" );
        // Add all the DocPos objects for the first word to the set:
        if ( words.length > 0 && ! words[0].isEmpty() ) {
            Set<DocPos> firstPositions = invertedIndex.get( words[0] );
            if ( firstPositions != null )  // null means the word isn't in the index
                filesContainingPhraseSoFar.addAll( firstPositions );
            // Remove the first word from the phrase:
            words = Arrays.copyOfRange( words, 1, words.length );
        }
for ( String word : words ) {
if ( filesContainingPhraseSoFar.size() == 0 )
break; // No files left to examine.
// Find all docs (and positions) of next word in phrase:
Set<DocPos> positions = invertedIndex.get( word );
if ( positions == null ) {
filesContainingPhraseSoFar.clear(); // Word is not in index,
break; // so the phrase isn't found.
}
// Remove any candidate if the phrase so far isn't followed by word:
            Set<DocPos> newPhraseEnds = new TreeSet<>();
            for ( DocPos lastWordInPhraseSoFar : filesContainingPhraseSoFar ) {
                // The next word must be in the same document, at the next position:
                DocPos nextWord = new DocPos( lastWordInPhraseSoFar.docID,
                        lastWordInPhraseSoFar.pos + 1 );
                if ( positions.contains(nextWord) )
                    newPhraseEnds.add(nextWord);
            }
            filesContainingPhraseSoFar = newPhraseEnds;
}
        // Convert the final set of matching phrase positions to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( filesContainingPhraseSoFar.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
Set<Long> docsContainingPhrase = new HashSet<>();
for ( DocPos dp : filesContainingPhraseSoFar ) {
docsContainingPhrase.add( dp.docID );
}
FileListModel model = FileListModel.getModel();
for ( long docID : docsContainingPhrase )
results.append( "\n" + model.getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Adds all the words in a user-selected file to the index.
* Words consist of letters, digits, and hyphens only. Before
* adding to the inverted index, the word is coerced to lowercase.
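 * <p>For example, a file containing the text {@code The quick, quick fox!}
 * is indexed as the words {@code the} (position 0), {@code quick}
 * (positions 1 and 2), and {@code fox} (position 3).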
*/
static void addFileToIndex() {
// Have user select a file to be indexed:
String lastDir = prefs.get(LAST_DIRECTORY, "." );
// Use AWT for the file dialog, to get native L&F:
FileDialog fd = new FileDialog(Main.maintenanceWindow,
"Select file to index", FileDialog.LOAD );
fd.setDirectory(lastDir);
fd.setIconImage(Main.appIcon.getImage()); // TODO: fix this.
fd.setVisible(true);
File[] selectedFiles = fd.getFiles();
if ( selectedFiles.length == 0 )
return; // User cancelled, so nothing to do.
File file = selectedFiles[0]; // single file selection mode.
// Validate file can be read:
if ( ! file.exists() || ! file.isFile() || ! file.canRead() ) {
JOptionPane.showMessageDialog(null,
"File \"" + file.getName() + "\" can't be indexed!" );
return;
}
lastDir = file.getParent();
prefs.put( LAST_DIRECTORY, lastDir ); // Save last used directory
// If file is already indexed, done; else add to filelist:
        String fileName;
        try { fileName = file.getCanonicalPath(); // Unique absolute pathname.
        } catch (IOException ioe) {
            ioe.printStackTrace();
            fileName = file.getAbsolutePath(); // Fall back rather than use null.
        }
FileListModel model = FileListModel.getModel();
if ( model.contains( fileName ) )
return; // File already indexed, so nothing to do.
long docID = model.addFile( fileName );
addDocData ( docID );
// Save updated index to file:
model.saveIndexToFile();
}
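    /** Removes the files currently selected in the file list (the maintenance
     * window's JTable) from both the inverted index and the file-list model,
     * after asking the user to confirm. The updated index is then saved to disk.
     */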
static void removeSelectedFilesFromIndex() {
FileListModel model = FileListModel.getModel();
Set<Long> selectedFiles = model.getSelectedFiles();
if ( selectedFiles.size() == 0 )
return; // No files selected.
        // Use JOptionPane to confirm, then remove the files one at a time:
        StringBuilder filesToRemove = new StringBuilder();
        for ( long docID : selectedFiles )
            filesToRemove.append( model.getFileName(docID) ).append( "\n" );
        int status = JOptionPane.showConfirmDialog( maintenanceWindow,
                "Remove these files from the index:\n" + filesToRemove );
        if ( status != JOptionPane.YES_OPTION ) // Default dialog returns YES/NO/CANCEL.
            return;
for ( long docID : selectedFiles ) {
removeDocData( docID );
// Remove file from model (the JTable):
model.removeFile( docID );
}
// Save updated index to file:
model.saveIndexToFile();
}
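    /** Removes every DocPos entry for the given document from the inverted
     * index. Any word left with no remaining documents is removed from the
     * index entirely.
     *
     * @param docID The ID of the document whose entries should be removed.
     */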
static void removeDocData ( long docID ) {
Set<String> keys = new HashSet<>(invertedIndex.keySet() );
// Remove every DocPos from the index that refers to docID:
for ( String word : keys ) {
Set<DocPos> data = invertedIndex.get(word);
Iterator<DocPos> it = data.iterator();
while ( it.hasNext() ) {
DocPos dp = it.next();
if ( dp.docID == docID ) {
it.remove();
}
}
// Delete word from index if no remaining documents contain it:
if ( data.size() == 0 )
invertedIndex.remove(word);
}
}
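    /** Reads the named document and adds each of its words to the inverted
     * index, recording the word's position (0, 1, 2, ...) within the file.
     * Words are lowercased and stripped of everything except letters, digits,
     * and hyphens; note that a token with embedded punctuation (e.g.
     * {@code foo.bar}) is therefore indexed as a single word ({@code foobar}).
     *
     * @param docID The ID of the document (file) to index.
     */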
static void addDocData ( long docID ) {
        // Add each word in the file to the index (converted to lowercase first):
        try ( Scanner in = new Scanner(
                new File( FileListModel.getModel().getFileName(docID) ) ) ) {
            int position = 0;
            while ( in.hasNext() ) {
                String word = in.next().toLowerCase();
                // Strip characters other than letters, digits, and hyphens;
                // skip the token entirely if nothing is left (it wasn't a word):
                word = word.replaceAll( "[^a-zA-Z0-9-]+", "" );
                if ( word.length() == 0 ) continue;
                DocPos dp = new DocPos( docID, position );
                if ( ! invertedIndex.containsKey(word) )
                    invertedIndex.put( word, new TreeSet<DocPos>() );
                invertedIndex.get( word ).add( dp );
                ++position;
            }
        }
        catch ( Exception e ) { e.printStackTrace(); }
}
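    /** Re-indexes every file whose status in the file-list model is
     * {@link FileStatus#NEEDS_UPDATE}: the file's old index data is removed,
     * the file is indexed again, and its recorded modification time is
     * updated. The updated index is then saved to disk.
     */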
static void updateIndex() {
FileListModel model = FileListModel.getModel();
int numFiles = model.getRowCount();
if ( numFiles == 0 )
return; // No files in index
Set<Long> docsNeedingUpdate = new HashSet<>();
for ( int i = 0; i < numFiles; ++i ) {
// Get the status (col 0 is the filename, col 1 is the status):
FileStatus status = (FileStatus) model.getValueAt(i, 1);
if ( status == FileStatus.NEEDS_UPDATE ) {
docsNeedingUpdate.add( FileListModel.fileList.get(i).fileID );
}
}
if ( docsNeedingUpdate.size() == 0 )
return; // All files up to date.
        // For each stale document, remove its old data, then re-index the file:
        for ( long docID : docsNeedingUpdate ) {
            removeDocData( docID );
            addDocData( docID );
            model.updateFileModTime( docID );
}
// Save updated index to file:
model.saveIndexToFile();
}
/**
* Locate and read file data into the in-memory list and map.
* (See "FileFormat.txt" for a description.) This method
* is also used to regenerate the map.
 * This implementation is not robust; there is no error handling, so if the
 * file is corrupted (not in the expected format), the failure is not
 * handled gracefully. TODO: Fix ASAP.
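 * <p>The expected layout, as inferred from the parsing code below (see
 * "FileFormat.txt" for the authoritative description), is roughly:
 * <pre>
 * FILE_HEADER                                  (file-format name and version)
 * next file ID to assign
 * fileID TAB filename TAB modification-time    (one line per indexed file)
 *                                              (blank separator line)
 * word docID,pos docID,pos ...                 (one line per indexed word)
 * </pre>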
*/
static void initializeIndexFromFile() {
        // The index file's absolute pathname (in the user's home directory):
        File indexFile = new File( Main.indexFileName );
        // Create and initialize a new, empty index file if it is missing:
        if ( ! indexFile.exists() ) {
            try ( PrintWriter out = new PrintWriter( new OutputStreamWriter(
                    new FileOutputStream( indexFile ), FILE_ENCODING ) ) ) {
                out.println( FILE_HEADER ); // File header and version
                out.println( "0" );         // Next FileID number to use
                out.println();              // An empty file list still needs its separator
            }
            catch ( Exception e ) { // Can't create a UTF-8 file in the user's home directory
                e.printStackTrace(); // Should never happen
            }
        }
// Open file and confirm format (initial line):
Scanner file = null;
try {
file = new Scanner( indexFile, FILE_ENCODING );
String header = file.nextLine();
if ( ! header.equals( FILE_HEADER ) ) { // TODO: something sensible
file.close();
throw new RuntimeException( "Incorrect File format: "
+ indexFile );
}
nextID = Long.parseLong( file.nextLine() );
// Read in file list; assume file format is okay if we get to here:
while ( file.hasNext() ) {
String line = file.nextLine(); // ID TAB name TAB timestamp
if ( line.length() == 0 ) // End of file list
break;
Scanner fileData = new Scanner( line );
fileData.useDelimiter( "\t" ); // Set delimiter to TAB
int id = fileData.nextInt();
String name = fileData.next();
long timestamp = fileData.nextLong();
fileData.close();
FileListModel.fileList.add( new FileItem( id, name, timestamp) );
}
// Read index data into map:
while ( file.hasNext() ) {
String line = file.nextLine();
Scanner indexData = new Scanner( line );
String word = indexData.next();
// The following code is not robust; it doesn't allow for errors:
SortedSet<DocPos> positions = new TreeSet<>();
while ( indexData.hasNext() ) {
String[] datum = indexData.next().split(",");
positions.add(new DocPos(Long.parseLong(datum[0]),
Integer.parseInt(datum[1]) ) );
}
indexData.close();
IndexUtils.invertedIndex.put(word, positions);
}
file.close();
} catch (FileNotFoundException e) {
e.printStackTrace(); // Should never happen
}
}
}