Given a list of documents and a list of keywords, how do we find all documents containing one or multiple keywords? For this question, we tokenize all documents into terms, and create a inverted index where each term is associated with a list of documents that contains that term.
We use this information to rank documents based on relevance to search queries and present the result to end users. Inverted index is widely used in search engines, database systems and applications, and is especially efficient in handling large volume of documents.
# Define the documents
document1 = "The quick brown fox jumped over the lazy dog."
document2 = "The lazy dog slept in the sun."
# Step 1: Tokenize the documents
# Convert each document to lowercase and split it into words
tokens1 = document1.lower().split()
tokens2 = document2.lower().split()
# Combine the tokens into a list of unique terms
terms = list(set(tokens1 + tokens2))
# Step 2: Build the inverted index
# Create an empty dictionary to store the inverted index
inverted_index = {}
# For each term, find the documents that contain it
for term in terms:
documents = []
if term in tokens1:
documents.append("Document 1")
if term in tokens2:
documents.append("Document 2")
inverted_index[term] = documents
# Step 3: Print the inverted index
for term, documents in inverted_index.items():
print(term, "->", ", ".join(documents))
Further Thoughts: Find query’s LCP among tokens with trie
To find the longest common prefix among string set S = {S1, S2, …, Sn} and string q, we can use a trie. Use case here is that if we cannot find a given keyword q, we can search through all tokenized terms S and find a longest common prefix p, then return the list of documents associated with p.
class TrieNode:
def __init__(self):
self.children = {}
self.isEnd = False
class Trie:
def __init__(self):
self.root = TrieNode()
def searchLCP(self, word: str):
node = self.root
prefix = ""
for curr in word:
if curr in node.children and not node.isEnd:
prefix += curr
node = node.children[curr]
else:
return prefix
return prefix
def insert(self, word: str) -> None:
cur = self.root
# Insert character by character into trie
for c in word:
# if character path does not exist, create it
if c not in cur.children:
cur.children[c] = TrieNode()
cur = cur.children[c]
cur.isEnd = True
def search(self, word: str) -> bool:
cur = self.root
# Search character by character in trie
for c in word:
# if character path does not exist, return False
if c not in cur.children:
return False
cur = cur.children[c]
return cur.isEnd
def startsWith(self, prefix: str) -> bool:
# Same as search, except there is no isEnd condition at final return
cur = self.root
for c in prefix:
if c not in cur.children:
return False
cur = cur.children[c]
return True
def longestCommonPrefix(strs: List[str], q: str) -> str:
# Trie O(S) build and O(m) search, m = len(q), S(S) for trie
if not strs: return ""
if len(strs) == 1: return strs[0]
trie = Trie()
for i in range(1, len(strs)):
trie.insert(strs[i])
return trie.searchLCP(q)
print(longestCommonPrefix(["I", "have", "an", "apple"], "applewatch")) # "apple"
A complete solution combining inverted index and LCP:
Q: Give a list of documents and give a list of keywords, write two function:
# getOr – return the list of documents containing at least one keyword
# getAnd – return the list of documents containing all the keywords
import random
class TrieNode:
def __init__(self):
self.children = {}
self.isEnd = False
class Trie:
def __init__(self):
self.root = TrieNode()
def searchLCP(self, word: str):
node = self.root
prefix = ""
for curr in word:
if curr in node.children and not node.isEnd:
prefix += curr
node = node.children[curr]
else:
return prefix
return prefix
def insert(self, word: str) -> None:
cur = self.root
# Insert character by character into trie
for c in word:
# if character path does not exist, create it
if c not in cur.children:
cur.children[c] = TrieNode()
cur = cur.children[c]
cur.isEnd = True
def search(self, word: str) -> bool:
cur = self.root
# Search character by character in trie
for c in word:
# if character path does not exist, return False
if c not in cur.children:
return False
cur = cur.children[c]
return cur.isEnd
def startsWith(self, prefix: str) -> bool:
# Same as search, except there is no isEnd condition at final return
cur = self.root
for c in prefix:
if c not in cur.children:
return False
cur = cur.children[c]
return True
def longestCommonPrefix(strs: List[str], q: str) -> str:
# Trie O(S) build and O(m) search, m = len(q), S(S) for trie
if not strs: return ""
if len(strs) == 1: return strs[0]
trie = Trie()
for i in range(1, len(strs)):
trie.insert(strs[i])
return trie.searchLCP(q)
def getAnd(docs, keywords):
res = set()
terms = set()
tokens = [] # 2-D list, each item is a list of tokens from a doc
inverted_index = {}
# docs[i] would be matched with tokens[i]
for i in range(len(docs)):
tokens.append(docs[i].lower().split())
# collect all terms
for lst in tokens:
terms = terms.union(set(lst))
# create inverted index
for term in terms:
documents = []
for i in range(len(docs)):
if term in tokens[i]:
documents.append(i)
inverted_index[term] = documents
# search
for word in keywords:
if word in inverted_index:
res = res.union(set(inverted_index[word]))
return list(res)
def getOr(docs, keywords, num):
words = random.sample(keywords, num)
return getAnd(docs, words)
# Define the documents
document1 = "The quick brown fox jumped over the lazy dog"
document2 = "The lazy dog slept in the sun"
document3 = "I have a beautiful house by the beach and near the lighthouse"
document4 = "She is a great employee working on a significant project"
d = []
d.append(document1)
d.append(document2)
d.append(document3)
d.append(document4)
k = ["the", "quick", "fix", "I", "sample", "r"]
k2 = ["the"]
print(getOr(d, k, 3))
print(getAnd(d, k))
print(getAnd(d, [longestCommonPrefix(k2, "there")]))
