import py
import fcntl

class mindex(object):
    """very simple and naive fulltext indexer

        stores data in a huge directory structure, where each character in
        each word encountered has its own directory (along with any word
        that starts with the same characters up to and including the character)
        
        indexing 'foo' and 'frob' will generate the following dir structure::

          /f/o/o/data
            /r/o/b/data

        creating a dir this way makes that searching is very fast, but indexing
        is relatively slow and creates a large dir

        supported are simple searches on multiple words, using 'and' or 'or
        behaviour, not supported is anything else (so no wildcards, etc.)
    """

    def __init__(self, path, charset='UTF-8'):
        self.charset = charset
    
        if type(path) == str:
            path = py.path.local(path)
        self.path = path
        self.path.ensure(dir=True)
        
        self.wordspath = path / 'words'
        self.wordspath.ensure(dir=True)

        self.revpath = path / 'reverse'
        self.revpath.ensure(dir=True)
    
    def index(self, path, data):
        """index a document

            we have both a dir structure for fulltext search where each 
            character has it's own directory (so 'foo' will result in 
            directory path 'f/o/o') and there's a mapping path to words
            (for easy removal and such)

            unindexes if the path is already registered as indexed
        """
        data = unicode(data, self.charset)
        data = data.replace('\n', ' ').replace('\r', ' ')
        # first isolate all words, and find out how many times they occur
        wordcounts = self._countwords(data)
        # now store the words in a file so it's easy to find out what indexes
        # were created on removal
        revpath = self.revpath / path
        if revpath.check():
            # there's already a file on this path, unindex
            self.unindex(path)
        revpath.ensure()
        revfp = revpath.open()
        try:
            fcntl.flock(revfp, fcntl.LOCK_EX)
            try:
                revpath.write(' '.join(wordcounts).encode(self.charset))
                # now store a reference to the path for each word encountered,
                # create a dir for each character in each word,
                # and register the path and occurrence
                for word, count in wordcounts.iteritems():
                    pathstring = self._makepath(word)
                    dpath = self.wordspath / pathstring
                    dpath.ensure(dir=True)
                    paths = dpath / 'paths'
                    paths.ensure()
                    pfp = paths.open('a')
                    try:
                        fcntl.flock(pfp, fcntl.LOCK_EX)
                        try:
                            pfp.write('%s %s\n' % (path, count))
                            pfp.flush()
                        finally:
                            fcntl.flock(pfp, fcntl.LOCK_UN)
                    finally:
                        pfp.close()
            finally:
                fcntl.flock(revfp, fcntl.LOCK_UN)
        finally:
            revfp.close()

    def unindex(self, path):
        """unindex a path

            is called on index if the path is already indexed
        """
        revpath = self.revpath / path
        if not revpath.check():
            return
        revfp = revpath.open()
        try:
            fcntl.flock(revfp, fcntl.LOCK_EX)
            try:
                words = unicode(revpath.read(), self.charset).split(' ')
                for word in words:
                    pathstring = self._makepath(word)
                    dpath = self.wordspath / pathstring / 'paths'
                    if not dpath.check():
                        continue
                    paths = dpath.readlines()
                    newpaths = [x.strip() for x in paths if x and 
                                ' '.join(x.strip().split(' ')[:-1]) != path]
                    dfp = dpath.open()
                    try:
                        fcntl.flock(dfp, fcntl.LOCK_EX)
                        try:
                            if newpaths:
                                dpath.write('\n'.join(newpaths) + '\n')
                            else:
                                ddpath = dpath.dirpath()
                                dpath.remove()
                                while not ddpath.listdir():
                                    if str(ddpath) == str(self.wordspath):
                                        break
                                    dpath = ddpath
                                    ddpath = dpath.dirpath()
                                    dpath.remove()
                        finally:
                            fcntl.flock(dfp, fcntl.LOCK_UN)
                    finally:
                        dfp.close()
                revpath.remove()
            finally:
                fcntl.flock(revfp, fcntl.LOCK_UN)
        finally:
            revfp.close()

    def search(self, words, andsearch=True):
        """returns all paths that contain all words
        
            ordered by occurrence count
        """
        words = [unicode(word, self.charset) for word in words]
        ret = {}
        andset = None
        for word in words:
            result = self._search_word(word)
            for path, count in result:
                ret[path] = ret.get(path, 0) + int(count)
            currset = set(x[0] for x in result)
            if andset is None:
                andset = currset
            else:
                andset = andset.intersection(currset)
        ret = ret.items()
        ret.sort(lambda a, b: -cmp(a[1], b[1]))
        ret = [x[0] for x in ret]
        if andsearch:
            ret = [x for x in ret if x in andset]
        return ret

    def _search_word(self, word):
        result = []
        if word.endswith('*'):
            paths = self._find_subpaths(word[:-1])
        else:
            paths = [self.wordspath / self._makepath(word) / 'paths']
        for datapath in paths:
            if datapath.check():
                for line in [line.strip() for line in datapath.readlines()]:
                    if line:
                        result.append(line.split(' ', 1))
        return result

    def _find_subpaths(self, word):
        basepath = self.wordspath / self._makepath(word)
        if basepath.check():
            for path in basepath.visit():
                if path.basename == 'paths':
                    yield path

    reg = py.std.re.compile('\w*', py.std.re.U)
    def _countwords(self, data):
        ret = {}
        for word in (s.lower() for s in self.reg.findall(data) if s):
            ret[word] = ret.get(word, 0) + 1
        return ret

    def _makepath(self, word):
        p = []
        for c in word:
            o = ord(c)
            if o > 128:
                c = o
            p.append(str(c))
        return '/'.join(p)

if __name__ == '__main__':
    q = mindex('/tmp/qs')
    q.index('/foo', 'one two three')
    print repr(q.search('two'))


