import py import fcntl class mindex(object): """very simple and naive fulltext indexer stores data in a huge directory structure, where each character in each word encountered has its own directory (along with any word that starts with the same characters up to and including the character) indexing 'foo' and 'frob' will generate the following dir structure:: /f/o/o/data /r/o/b/data creating a dir this way makes that searching is very fast, but indexing is relatively slow and creates a large dir supported are simple searches on multiple words, using 'and' or 'or behaviour, not supported is anything else (so no wildcards, etc.) """ def __init__(self, path, charset='UTF-8'): self.charset = charset if type(path) == str: path = py.path.local(path) self.path = path self.path.ensure(dir=True) self.wordspath = path / 'words' self.wordspath.ensure(dir=True) self.revpath = path / 'reverse' self.revpath.ensure(dir=True) def index(self, path, data): """index a document we have both a dir structure for fulltext search where each character has it's own directory (so 'foo' will result in directory path 'f/o/o') and there's a mapping path to words (for easy removal and such) unindexes if the path is already registered as indexed """ if not isinstance(data, unicode): data = unicode(str(data), self.charset) data = data.replace('\n', ' ').replace('\r', ' ') # first isolate all words, and find out how many times they occur wordcounts = self._countwords(data) # now store the words in a file so it's easy to find out what indexes # were created on removal revpath = self.revpath / path if revpath.check(): # there's already a file on this path, unindex self.unindex(path) revpath.ensure() revfp = revpath.open() try: fcntl.flock(revfp, fcntl.LOCK_EX) try: revpath.write(' '.join(wordcounts).encode(self.charset)) # now store a reference to the path for each word encountered, # create a dir for each character in each word, # and register the path and occurrence for word, count in wordcounts.iteritems(): pathstring = self._makepath(word) dpath = self.wordspath / pathstring dpath.ensure(dir=True) paths = dpath / 'paths' paths.ensure() pfp = paths.open('a') try: fcntl.flock(pfp, fcntl.LOCK_EX) try: pfp.write('%s %s\n' % (path, count)) pfp.flush() finally: fcntl.flock(pfp, fcntl.LOCK_UN) finally: pfp.close() finally: fcntl.flock(revfp, fcntl.LOCK_UN) finally: revfp.close() def unindex(self, path, recurse=False): """unindex a path is called on index if the path is already indexed """ revpath = self.revpath / path if recurse and revpath.check(dir=True): for fpath in revpath.visit(): if fpath.check(file=True): relpath = fpath.relto(revpath) childpath = '%s/%s' % (path, relpath) self.unindex(childpath) return elif not revpath.check(file=True): return revfp = revpath.open() try: fcntl.flock(revfp, fcntl.LOCK_EX) try: words = unicode(revpath.read(), self.charset).split(' ') for word in words: pathstring = self._makepath(word) dpath = self.wordspath / pathstring / 'paths' if not dpath.check(): continue paths = dpath.readlines() newpaths = [x.strip() for x in paths if x and ' '.join(x.strip().split(' ')[:-1]) != path] dfp = dpath.open() try: fcntl.flock(dfp, fcntl.LOCK_EX) try: if newpaths: dpath.write('\n'.join(newpaths) + '\n') else: ddpath = dpath.dirpath() dpath.remove() while not ddpath.listdir(): if str(ddpath) == str(self.wordspath): break dpath = ddpath ddpath = dpath.dirpath() dpath.remove() finally: fcntl.flock(dfp, fcntl.LOCK_UN) finally: dfp.close() revpath.remove() finally: fcntl.flock(revfp, fcntl.LOCK_UN) finally: revfp.close() def search(self, words, andsearch=True): """returns all paths that contain all words ordered by occurrence count """ words = [unicode(word, self.charset) for word in words] ret = {} andset = None for word in words: result = self._search_word(word) for path, count in result: ret[path] = ret.get(path, 0) + int(count) currset = set(x[0] for x in result) if andset is None: andset = currset else: andset = andset.intersection(currset) ret = ret.items() ret.sort(lambda a, b: -cmp(a[1], b[1])) ret = [x[0] for x in ret] if andsearch: ret = [x for x in ret if x in andset] return ret def _search_word(self, word): result = [] if word.endswith('*'): paths = self._find_subpaths(word[:-1]) else: paths = [self.wordspath / self._makepath(word) / 'paths'] for datapath in paths: if datapath.check(): for line in [line.strip() for line in datapath.readlines()]: if line: result.append(line.split(' ', 1)) return result def _find_subpaths(self, word): basepath = self.wordspath / self._makepath(word) if basepath.check(): for path in basepath.visit(): if path.basename == 'paths': yield path reg = py.std.re.compile('\w*', py.std.re.U) def _countwords(self, data): ret = {} for word in (s.lower() for s in self.reg.findall(data) if s): ret[word] = ret.get(word, 0) + 1 return ret def _makepath(self, word): p = [] for c in word: o = ord(c) if o > 128: c = o p.append(str(c)) return '/'.join(p) if __name__ == '__main__': q = mindex('/tmp/qs') q.index('/foo', 'one two three') print repr(q.search('two'))