BLOG | Inverted index: search keywords in a list of documents

Given a list of documents and a list of keywords, how do we find all documents containing one or multiple keywords? For this question, we tokenize all documents into terms, and create a inverted index where each term is associated with a list of documents that contains that term.

We use this information to rank documents based on relevance to search queries and present the result to end users. Inverted index is widely used in search engines, database systems and applications, and is especially efficient in handling large volume of documents.

# Define the documents
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "The lazy dog slept in the sun."

# Step 1: Tokenize the documents
# Convert each document to lowercase and split it into words
tokens1 = document1.lower().split()
tokens2 = document2.lower().split() 

# Combine the tokens into a list of unique terms
terms = list(set(tokens1 + tokens2))

# Step 2: Build the inverted index
# Create an empty dictionary to store the inverted index
inverted_index = {}

# For each term, find the documents that contain it
for term in terms:
	documents = []
	if term in tokens1:
		documents.append("Document 1")
	if term in tokens2:
		documents.append("Document 2")
	inverted_index[term] = documents

# Step 3: Print the inverted index
for term, documents in inverted_index.items():
	print(term, "->", ", ".join(documents))

Further Thoughts: Find query’s LCP among tokens with trie

To find the longest common prefix among string set S = {S1, S2, …, Sn} and string q, we can use a trie. Use case here is that if we cannot find a given keyword q, we can search through all tokenized terms S and find a longest common prefix p, then return the list of documents associated with p.

class TrieNode:
    def __init__(self):
        self.children = {}
        self.isEnd = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def searchLCP(self, word: str):
        node = self.root
        prefix = ""
        for curr in word:
            if curr in node.children and not node.isEnd:
                prefix += curr
                node = node.children[curr]
            else:
                return prefix
        return prefix

    def insert(self, word: str) -> None:
        cur = self.root
        # Insert character by character into trie
        for c in word:
            # if character path does not exist, create it
            if c not in cur.children:
                cur.children[c] = TrieNode()
            cur = cur.children[c]
        cur.isEnd = True
        

    def search(self, word: str) -> bool:
        cur = self.root
        # Search character by character in trie
        for c in word:
            # if character path does not exist, return False
            if c not in cur.children:
                return False
            cur = cur.children[c]
        return cur.isEnd
        

    def startsWith(self, prefix: str) -> bool:
        # Same as search, except there is no isEnd condition at final return
        cur = self.root
        for c in prefix:
            if c not in cur.children:
                return False
            cur = cur.children[c]
        return True

def longestCommonPrefix(strs: List[str], q: str) -> str:
    # Trie O(S) build and O(m) search, m = len(q), S(S) for trie 
    if not strs: return ""
    if len(strs) == 1: return strs[0]
    trie = Trie()
    for i in range(1, len(strs)):
        trie.insert(strs[i])

    return trie.searchLCP(q)
print(longestCommonPrefix(["I", "have", "an", "apple"], "applewatch"))    # "apple"

A complete solution combining inverted index and LCP:

Q: Give a list of documents and give a list of keywords, write two function:
# getOr – return the list of documents containing at least one keyword
# getAnd – return the list of documents containing all the keywords

import random

class TrieNode:
    def __init__(self):
        self.children = {}
        self.isEnd = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def searchLCP(self, word: str):
        node = self.root
        prefix = ""
        for curr in word:
            if curr in node.children and not node.isEnd:
                prefix += curr
                node = node.children[curr]
            else:
                return prefix
        return prefix

    def insert(self, word: str) -> None:
        cur = self.root
        # Insert character by character into trie
        for c in word:
            # if character path does not exist, create it
            if c not in cur.children:
                cur.children[c] = TrieNode()
            cur = cur.children[c]
        cur.isEnd = True
        

    def search(self, word: str) -> bool:
        cur = self.root
        # Search character by character in trie
        for c in word:
            # if character path does not exist, return False
            if c not in cur.children:
                return False
            cur = cur.children[c]
        return cur.isEnd
        

    def startsWith(self, prefix: str) -> bool:
        # Same as search, except there is no isEnd condition at final return
        cur = self.root
        for c in prefix:
            if c not in cur.children:
                return False
            cur = cur.children[c]
        return True

def longestCommonPrefix(strs: List[str], q: str) -> str:
    # Trie O(S) build and O(m) search, m = len(q), S(S) for trie 
    if not strs: return ""
    if len(strs) == 1: return strs[0]
    trie = Trie()
    for i in range(1, len(strs)):
        trie.insert(strs[i])

    return trie.searchLCP(q)

def getAnd(docs, keywords):
    res = set()
    terms = set()
    tokens = []    # 2-D list, each item is a list of tokens from a doc
    inverted_index = {}
    # docs[i] would be matched with tokens[i]
    for i in range(len(docs)):
        tokens.append(docs[i].lower().split())

    # collect all terms
    for lst in tokens:
        terms = terms.union(set(lst))

    # create inverted index
    for term in terms:
        documents = []
        for i in range(len(docs)):
            if term in tokens[i]:
                documents.append(i)
        inverted_index[term] = documents
    
    # search 
    for word in keywords:
        if word in inverted_index:
            res = res.union(set(inverted_index[word]))
    return list(res)

def getOr(docs, keywords, num):
    words = random.sample(keywords, num)
    return getAnd(docs, words)

# Define the documents
document1 = "The quick brown fox jumped over the lazy dog"
document2 = "The lazy dog slept in the sun"
document3 = "I have a beautiful house by the beach and near the lighthouse"
document4 = "She is a great employee working on a significant project"

d = []
d.append(document1)
d.append(document2)
d.append(document3)
d.append(document4)
k = ["the", "quick", "fix", "I", "sample", "r"]
k2 = ["the"]
print(getOr(d, k, 3))
print(getAnd(d, k))
print(getAnd(d, [longestCommonPrefix(k2, "there")]))

Resources:

Leave a comment