Skip to content

Commit cf14622

Browse files
author
tomdyson
committed
Initial support for multi-term queries
1 parent a817063 commit cf14622

File tree

1 file changed

+54
-6
lines changed

1 file changed

+54
-6
lines changed

dolphy.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import time
1212
import dbhash
1313
import marshal
14+
from sets import Set
1415

1516
# Absolute path to the stopwords text file. NOTE(review): this is hard-coded
# to a directory under one user's home folder, so it has to be changed before
# the module can run anywhere else.
STOPWORDS = '/Users/tomdyson/Documents/code/python/dolphy/data/stopwords.txt'
1617

@@ -98,19 +99,66 @@ def sortByDate(self, documents):
9899
results.sort()
99100
results.reverse()
100101
return results
102+
103+
def mergeMatches(self, doc_groups, merge_type="intersection"):
    """Combine per-term match dictionaries into a single dictionary.

    Used for multi-term queries: each element of ``doc_groups`` holds the
    matches for one term, and the groups are merged either as an
    intersection (AND queries) or as a union (OR queries).

    Arguments:
        doc_groups -- sequence of dictionaries keyed by document, where
            each value is indexable as ``(positions, length)``:
            ``positions`` is a list and ``length`` is a single value
            stored alongside it.
        merge_type -- ``"intersection"`` (default) keeps only documents
            present in every group; ``"union"`` keeps documents present
            in at least one group.

    Returns a new dictionary mapping each surviving document to
    ``(positions, length)``. ``positions`` is the concatenation of that
    document's position lists, taken in ``doc_groups`` order. ``length``
    comes from the last group that contains the document. An empty
    ``doc_groups`` yields an empty dictionary. The input dictionaries and
    their position lists are never modified.

    Raises ValueError if ``merge_type`` is not one of the two supported
    values.
    """
    # TODO: less frequent terms (across all documents) should
    # be weighted - perhaps this is where to return the weighting
    # (or just the frequency)
    if merge_type not in ("intersection", "union"):
        # A misspelt merge type used to fall through and return an empty
        # result, which is indistinguishable from "nothing matched".
        raise ValueError("unknown merge_type: %r" % (merge_type,))

    doc_groups = list(doc_groups)
    if not doc_groups:
        # No term produced any matches, so there is nothing to merge.
        return {}

    # The builtin set type (Python 2.4+) replaces the deprecated sets.Set.
    merged_docs = set(doc_groups[0])
    for doc_group in doc_groups[1:]:
        if merge_type == "intersection":
            merged_docs &= set(doc_group)
        else:
            merged_docs |= set(doc_group)

    combined_set = {}
    for doc in merged_docs:
        positions = []
        doc_length = 0
        for doc_group in doc_groups:
            # Always true for intersections; needed for unions, where a
            # document may be missing from some of the groups.
            if doc in doc_group:
                entry = doc_group[doc]
                positions.extend(entry[0])
                doc_length = entry[1]  # should only have to get this once
        combined_set[doc] = (positions, doc_length)
    return combined_set
101136

102-
def search(self, query, summarise='simple', page_start=1, page_size=10):
103-
"""Retrieve and sort documents containing the specified term"""
104-
query = query.lower()
137+
def search(self, query, summarise='simple', page_start=1, page_size=10, operator="AND"):
138+
"""Retrieve and sort documents containing the specified term(s)"""
139+
query_terms = query.lower().strip().split(' ')
105140
ret = []
106141
t = Text()
107142
porter = tokenize.PorterStemmer()
108-
stemmed_query = porter.stem(query)
109-
matching_documents = self.db.get('T_' + stemmed_query)
143+
if len(query_terms) > 1:
144+
matching_document_groups = []
145+
for query_term in query_terms:
146+
stemmed_query = porter.stem(query_term)
147+
matching_documents = self.db.get('T_' + query_term)
148+
if matching_documents:
149+
matching_document_groups.append(marshal.loads(matching_documents))
150+
# copy the list of matching document groups for sets
151+
if operator == "AND": join_type = "intersection"
152+
elif operator == "OR": join_type = "union"
153+
documents = self.mergeMatches(matching_document_groups, join_type)
154+
else:
155+
query_term = query_terms[0]
156+
stemmed_query = porter.stem(query_term)
157+
matching_documents = self.db.get('T_' + query_term)
158+
documents = marshal.loads(matching_documents)
110159
ret = {}
111160
ret['query'] = query
112161
if matching_documents:
113-
documents = marshal.loads(matching_documents)
114162
results = self.sort_by(documents)
115163
ret['count'] = len(results)
116164
ranked_documents = []

0 commit comments

Comments
 (0)