package com.wpollock.searchengine;
import java.awt.FileDialog;
import java.io.*;
import java.util.*;
import javax.swing.JOptionPane;
import javax.swing.JTextPane;
import static com.wpollock.searchengine.Main.*;
/**
 * To keep each file small and focused, I put the file read/write code
 * and the searching code in this class instead of in Main.
*
* @author wpollock
*/
class IndexUtils {
// The three search methods display the files, one per line, that
// match the search terms. The resulting JTextPane is selectable, but
// not editable. That allows users to select and copy filenames.
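    /** The inverted index: maps each (lowercased) word to the sorted set of
     * DocPos (document ID, word position) pairs at which that word occurs.
     * For example, if "apple" is word 0 of document 1 and word 5 of
     * document 2, its entry is {@code "apple" -> [(1,0), (2,5)]}.
     */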
static Map<String, SortedSet<DocPos>> invertedIndex = new HashMap<>();
/** Does an AND search. The idea is to start with a set of all document
* IDs in the index. Then for each term, remove any documents that don't
* match. The resulting final set is the documents matching all search
 * terms. (If the user enters no words, e.g. " ,. ", then all documents
 * are considered matching.)
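 * <p>Illustrative example (hypothetical words and document IDs): suppose
 * the index holds documents {1, 2, 3}, with "apple" occurring in {1, 2}
 * and "pie" in {2, 3}. Then:
 * <pre>{@code
 * // files starts as every doc ID in the index:  {1, 2, 3}
 * // after intersecting with docs for "apple":   {1, 2}
 * // after intersecting with docs for "pie":     {2}
 * doAndSearch("apple pie", resultsPane);  // lists only document 2's file
 * }</pre>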
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doAndSearch( String searchTerms, JTextPane resultsPane ) {
        // Copy all doc IDs, so retainAll()/clear() below can't modify the model's set:
        Set<Long> files = new TreeSet<>( FileListModel.getModel().getAllDocIDs() );
// split into words on any non-letter, non-digit, and non-dash:
String[] words = searchTerms.split( "[^a-zA-Z0-9-]+" );
// (Note, if user input contains no words, all docs match!)
        for ( String word : words ) {
            if ( word.isEmpty() )
                continue;  // Skip the empty token a leading delimiter produces.
            if ( files.isEmpty() )
                break;     // No files left to examine.
            Set<DocPos> positions = invertedIndex.get( word.toLowerCase() );
if ( positions == null ) {
files.clear(); // This word is not in index, so the
break; // AND search produced no matching files.
}
// Build the set of docIDs that contain the search term:
Set<Long> filesContainingWord = new TreeSet<>();
for ( DocPos position : positions )
filesContainingWord.add( position.docID );
// Remove any docIDs from files that don't contain the search term:
            files.retainAll( filesContainingWord ); // Set intersection.
}
// Convert the final set of matching docIDs to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( files.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
for ( Long docID : files )
results.append( "\n" + FileListModel.getModel().getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Does an OR search.
 * The general idea is to start with an empty set of matching files, and
 * add every document that contains any of the search terms found in the
 * index. (If the user enters no words, e.g. " ,. ", then no documents
 * are considered matching.)
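 * <p>Illustrative example (hypothetical words and document IDs): if the
 * index shows "apple" in documents {1, 2} and "pie" in documents {2, 3},
 * then
 * <pre>{@code
 * doOrSearch("apple pie", resultsPane);  // lists the files for documents 1, 2, and 3
 * }</pre>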
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doOrSearch( String searchTerms, JTextPane resultsPane ) {
Set<Long> files = new TreeSet<>(); // Set of matching docIDs
// split into words on any non-letter, non-digit, and non-dash:
String[] words = searchTerms.split( "[^a-zA-Z0-9-]+" );
for ( String word : words ) {
Set<DocPos> positions = invertedIndex.get( word.toLowerCase() );
if ( positions == null ) {
continue; // This word is not in index.
}
for ( DocPos position : positions ) {
files.add( position.docID ); // Add the file ID to a set
}
}
// Convert the final set of matching docIDs to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( files.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
for ( Long docID : files )
results.append( "\n" + FileListModel.getModel().getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Does a PHRASE search.
* The general algorithm is: build a set of DocPos objects for the
* documents that contain the first word. Then for each remaining
* word in the phrase, check each DocPos in the set to see if the word
* is in the same document, in the next position. If not, remove that
* DocPos from the set. Repeat until set is empty (so phrase not found),
* or until the last word of the phrase has been processed.
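 * <p>Illustrative example (hypothetical words, document IDs, and word
 * positions): if "quick" occurs at (docID=7, pos=3) and "brown" occurs at
 * (docID=7, pos=4), then
 * <pre>{@code
 * doPhraseSearch("quick brown", resultsPane);  // lists document 7's file
 * }</pre>
 * If "brown" instead occurred only at, say, (docID=7, pos=9), the phrase
 * would not match.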
*
* @param searchTerms A String of the user-supplied search terms.
* @param resultsPane The TextComponent to display the results in.
*/
static void doPhraseSearch( String searchTerms, JTextPane resultsPane ) {
        Set<DocPos> filesContainingPhraseSoFar = new TreeSet<>();
        // Split into words on any non-letter, non-digit, non-dash character.
        // (First strip any leading delimiters, so split() doesn't produce an
        // empty first token for input such as ", quick brown".)
        String[] words = searchTerms.toLowerCase()
                .replaceFirst( "^[^a-z0-9-]+", "" )
                .split( "[^a-z0-9-]+" );
        // Add all the DocPos objects for the first word to the set:
        if ( words.length > 0 && ! words[0].isEmpty() ) {
            Set<DocPos> firstPositions = invertedIndex.get( words[0] );
            if ( firstPositions != null )  // null means the word isn't in the index
                filesContainingPhraseSoFar.addAll( firstPositions );
            // Remove the first word from the phrase:
            words = Arrays.copyOfRange( words, 1, words.length );
        }
for ( String word : words ) {
if ( filesContainingPhraseSoFar.size() == 0 )
break; // No files left to examine.
// Find all docs (and positions) of next word in phrase:
Set<DocPos> positions = invertedIndex.get( word );
if ( positions == null ) {
filesContainingPhraseSoFar.clear(); // Word is not in index,
break; // so the phrase isn't found.
}
// Remove any candidate if the phrase so far isn't followed by word:
            Set<DocPos> newPhraseEnds = new TreeSet<>();
            for ( DocPos lastWordInPhraseSoFar : filesContainingPhraseSoFar ) {
                // The next word must be in the same document, at the next position:
                DocPos nextWord = new DocPos( lastWordInPhraseSoFar.docID,
                        lastWordInPhraseSoFar.pos + 1 );
                if ( positions.contains(nextWord) )
                    newPhraseEnds.add(nextWord);
            }
            filesContainingPhraseSoFar = newPhraseEnds;
}
        // Convert the final set of matching phrase positions to filenames:
StringBuilder results = new StringBuilder();
// Add appropriate title to results:
if ( filesContainingPhraseSoFar.size() > 0 )
results.append(" Matching Files:\n" );
else
results.append(" No Matching Files found." );
// Add each file name found to the results:
Set<Long> docsContainingPhrase = new HashSet<>();
for ( DocPos dp : filesContainingPhraseSoFar ) {
docsContainingPhrase.add( dp.docID );
}
FileListModel model = FileListModel.getModel();
for ( long docID : docsContainingPhrase )
results.append( "\n" + model.getFileName(docID) );
resultsPane.setText( results.toString() );
}
/** Adds all the words in a user-selected file to the index.
* Words consist of letters, digits, and hyphens only. Before
* adding to the inverted index, the word is coerced to lowercase.
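 * <p>For example, a file containing the text {@code The quick, quick fox!}
 * is indexed as the words {@code the} (position 0), {@code quick}
 * (positions 1 and 2), and {@code fox} (position 3).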
*/
static void addFileToIndex() {
// Have user select a file to be indexed:
String lastDir = prefs.get(LAST_DIRECTORY, "." );
// Use AWT for the file dialog, to get native L&F:
FileDialog fd = new FileDialog(Main.maintenanceWindow,
"Select file to index", FileDialog.LOAD );
fd.setDirectory(lastDir);
fd.setIconImage(Main.appIcon.getImage()); // TODO: fix this.
fd.setVisible(true);
File[] selectedFiles = fd.getFiles();
if ( selectedFiles.length == 0 )
return; // User cancelled, so nothing to do.
File file = selectedFiles[0]; // single file selection mode.
// Validate file can be read:
if ( ! file.exists() || ! file.isFile() || ! file.canRead() ) {
JOptionPane.showMessageDialog(null,
"File \"" + file.getName() + "\" can't be indexed!" );
return;
}
lastDir = file.getParent();
prefs.put( LAST_DIRECTORY, lastDir ); // Save last used directory
// If file is already indexed, done; else add to filelist:
        String fileName;
        try { fileName = file.getCanonicalPath(); // Unique absolute pathname.
        } catch (IOException ioe) {
            ioe.printStackTrace();
            fileName = file.getAbsolutePath(); // Fall back rather than use null.
        }
FileListModel model = FileListModel.getModel();
if ( model.contains( fileName ) )
return; // File already indexed, so nothing to do.
long docID = model.addFile( fileName );
addDocData ( docID );
// Save updated index to file:
model.saveIndexToFile();
}
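    /** Removes the files currently selected in the file list (the maintenance
     * window's JTable) from both the inverted index and the file-list model,
     * after asking the user to confirm. The updated index is then saved to disk.
     */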
static void removeSelectedFilesFromIndex() {
FileListModel model = FileListModel.getModel();
Set<Long> selectedFiles = model.getSelectedFiles();
if ( selectedFiles.size() == 0 )
return; // No files selected.
        // Use JOptionPane to confirm, then remove the files one at a time:
        StringBuilder filesToRemove = new StringBuilder();
        for ( long docID : selectedFiles )
            filesToRemove.append( model.getFileName(docID) ).append( "\n" );
        int status = JOptionPane.showConfirmDialog( maintenanceWindow,
                "Remove these files from the index:\n" + filesToRemove );
        if ( status != JOptionPane.YES_OPTION ) // Default dialog returns YES/NO/CANCEL.
            return;
for ( long docID : selectedFiles ) {
removeDocData( docID );
// Remove file from model (the JTable):
model.removeFile( docID );
}
// Save updated index to file:
model.saveIndexToFile();
}
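    /** Removes every DocPos entry for the given document from the inverted
     * index. Any word left with no remaining documents is removed from the
     * index entirely.
     *
     * @param docID The ID of the document whose entries should be removed.
     */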
static void removeDocData ( long docID ) {
Set<String> keys = new HashSet<>(invertedIndex.keySet() );
// Remove every DocPos from the index that refers to docID:
for ( String word : keys ) {
Set<DocPos> data = invertedIndex.get(word);
Iterator<DocPos> it = data.iterator();
while ( it.hasNext() ) {
DocPos dp = it.next();
if ( dp.docID == docID ) {
it.remove();
}
}
// Delete word from index if no remaining documents contain it:
if ( data.size() == 0 )
invertedIndex.remove(word);
}
}
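    /** Reads the named document and adds each of its words to the inverted
     * index, recording the word's position (0, 1, 2, ...) within the file.
     * Words are lowercased and stripped of everything except letters, digits,
     * and hyphens; note that a token with embedded punctuation (e.g.
     * {@code foo.bar}) is therefore indexed as a single word ({@code foobar}).
     *
     * @param docID The ID of the document (file) to index.
     */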
static void addDocData ( long docID ) {
        // Add each word in the file to the index (converted to lowercase first):
        try ( Scanner in = new Scanner(
                new File( FileListModel.getModel().getFileName(docID) ) ) ) {
            int position = 0;
            while ( in.hasNext() ) {
                String word = in.next().toLowerCase();
                // Strip characters other than letters, digits, and hyphens;
                // skip the token entirely if nothing is left (it wasn't a word):
                word = word.replaceAll( "[^a-zA-Z0-9-]+", "" );
                if ( word.length() == 0 ) continue;
                DocPos dp = new DocPos( docID, position );
                if ( ! invertedIndex.containsKey(word) )
                    invertedIndex.put( word, new TreeSet<DocPos>() );
                invertedIndex.get( word ).add( dp );
                ++position;
            }
        }
        catch ( Exception e ) { e.printStackTrace(); }
}
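    /** Re-indexes every file whose status in the file-list model is
     * {@link FileStatus#NEEDS_UPDATE}: the file's old index data is removed,
     * the file is indexed again, and its recorded modification time is
     * updated. The updated index is then saved to disk.
     */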
static void updateIndex() {
FileListModel model = FileListModel.getModel();
int numFiles = model.getRowCount();
if ( numFiles == 0 )
return; // No files in index
Set<Long> docsNeedingUpdate = new HashSet<>();
for ( int i = 0; i < numFiles; ++i ) {
// Get the status (col 0 is the filename, col 1 is the status):
FileStatus status = (FileStatus) model.getValueAt(i, 1);
if ( status == FileStatus.NEEDS_UPDATE ) {
docsNeedingUpdate.add( FileListModel.fileList.get(i).fileID );
}
}
if ( docsNeedingUpdate.size() == 0 )
return; // All files up to date.
        // For each stale document, remove its old data, then re-index the file:
        for ( long docID : docsNeedingUpdate ) {
            removeDocData( docID );
            addDocData( docID );
            model.updateFileModTime( docID );
}
// Save updated index to file:
model.saveIndexToFile();
}
/**
* Locate and read file data into the in-memory list and map.
* (See "FileFormat.txt" for a description.) This method
* is also used to regenerate the map.
 * This implementation is not robust; there is no error handling, so if the
 * file is corrupted (not in the expected format), the failure is not
 * handled gracefully. TODO: Fix ASAP.
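 * <p>The expected layout, as inferred from the parsing code below (see
 * "FileFormat.txt" for the authoritative description), is roughly:
 * <pre>
 * FILE_HEADER                                  (file-format name and version)
 * next file ID to assign
 * fileID TAB filename TAB modification-time    (one line per indexed file)
 *                                              (blank separator line)
 * word docID,pos docID,pos ...                 (one line per indexed word)
 * </pre>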
*/
static void initializeIndexFromFile() {
        // The index file's absolute pathname (in the user's home directory):
        File indexFile = new File( Main.indexFileName );
        // Create and initialize a new, empty index file if it is missing:
        if ( ! indexFile.exists() ) {
            try ( PrintWriter out = new PrintWriter( new OutputStreamWriter(
                    new FileOutputStream( indexFile ), FILE_ENCODING ) ) ) {
                out.println( FILE_HEADER ); // File header and version
                out.println( "0" );         // Next FileID number to use
                out.println();              // An empty file list still needs its separator
            }
            catch ( Exception e ) { // Can't create a UTF-8 file in the user's home directory
                e.printStackTrace(); // Should never happen
            }
        }
// Open file and confirm format (initial line):
Scanner file = null;
try {
file = new Scanner( indexFile, FILE_ENCODING );
String header = file.nextLine();
if ( ! header.equals( FILE_HEADER ) ) { // TODO: something sensible
file.close();
throw new RuntimeException( "Incorrect File format: "
+ indexFile );
}
nextID = Long.parseLong( file.nextLine() );
// Read in file list; assume file format is okay if we get to here:
while ( file.hasNext() ) {
String line = file.nextLine(); // ID TAB name TAB timestamp
if ( line.length() == 0 ) // End of file list
break;
Scanner fileData = new Scanner( line );
fileData.useDelimiter( "\t" ); // Set delimiter to TAB
int id = fileData.nextInt();
String name = fileData.next();
long timestamp = fileData.nextLong();
fileData.close();
FileListModel.fileList.add( new FileItem( id, name, timestamp) );
}
// Read index data into map:
while ( file.hasNext() ) {
String line = file.nextLine();
Scanner indexData = new Scanner( line );
String word = indexData.next();
// The following code is not robust; it doesn't allow for errors:
SortedSet<DocPos> positions = new TreeSet<>();
while ( indexData.hasNext() ) {
String[] datum = indexData.next().split(",");
positions.add(new DocPos(Long.parseLong(datum[0]),
Integer.parseInt(datum[1]) ) );
}
indexData.close();
IndexUtils.invertedIndex.put(word, positions);
}
file.close();
} catch (FileNotFoundException e) {
e.printStackTrace(); // Should never happen
}
}
}