11 | 11 | import time
12 | 12 | import dbhash
13 | 13 | import marshal
   | 14 | +from sets import Set
14 | 15 |
15 | 16 | STOPWORDS = '/Users/tomdyson/Documents/code/python/dolphy/data/stopwords.txt'
16 | 17 |
@@ -98,19 +99,66 @@ def sortByDate(self, documents):
98 | 99 |         results.sort()
99 | 100 |         results.reverse()
100 | 101 |         return results
    | 102 | +
    | 103 | +    def mergeMatches(self, doc_groups, merge_type="intersection"):
    | 104 | +        """ Combine sets of matching documents, e.g. for each term
    | 105 | +        in a multi-term query. Supports intersections (for AND
    | 106 | +        queries) and unions (for ORs). """
    | 107 | +        # TODO: less frequent terms (across all documents) should
    | 108 | +        # be weighted - perhaps this is where to return the weighting
    | 109 | +        # (or just the frequency)
    | 110 | +        combined_set = {}
    | 111 | +        doc_groups_copy = list(doc_groups)
    | 112 | +        intersected = Set(doc_groups_copy.pop().keys())
    | 113 | +        if merge_type == "intersection":
    | 114 | +            for doc_group in doc_groups_copy:
    | 115 | +                intersected = intersected.intersection(Set(doc_group.keys()))
    | 116 | +            for doc in intersected:
    | 117 | +                positions = []
    | 118 | +                doc_length = 0
    | 119 | +                for doc_group in doc_groups:
    | 120 | +                    positions.extend(doc_group[doc][0])
    | 121 | +                    doc_length = doc_group[doc][1]  # should only have to get this once
    | 122 | +                combined_set[doc] = (positions, doc_length)
    | 123 | +        elif merge_type == "union":
    | 124 | +            for doc_group in doc_groups_copy:
    | 125 | +                intersected = intersected.union(Set(doc_group.keys()))
    | 126 | +            for doc in intersected:
    | 127 | +                positions = []
    | 128 | +                doc_length = 0
    | 129 | +                for doc_group in doc_groups:
    | 130 | +                    if doc in doc_group:
    | 131 | +                        positions.extend(doc_group[doc][0])
    | 132 | +                        doc_length = doc_group[doc][1]  # should only have to get this once
    | 133 | +                combined_set[doc] = (positions, doc_length)
    | 134 | +        print combined_set
    | 135 | +        return combined_set
101 | 136 |
102 |     | -    def search(self, query, summarise='simple', page_start=1, page_size=10):
103 |     | -        """Retrieve and sort documents containing the specified term"""
104 |     | -        query = query.lower()
    | 137 | +    def search(self, query, summarise='simple', page_start=1, page_size=10, operator="AND"):
    | 138 | +        """Retrieve and sort documents containing the specified term(s)"""
    | 139 | +        query_terms = query.lower().strip().split(' ')
105 | 140 |         ret = []
106 | 141 |         t = Text()
107 | 142 |         porter = tokenize.PorterStemmer()
108 |     | -        stemmed_query = porter.stem(query)
109 |     | -        matching_documents = self.db.get('T_' + stemmed_query)
    | 143 | +        if len(query_terms) > 1:
    | 144 | +            matching_document_groups = []
    | 145 | +            for query_term in query_terms:
    | 146 | +                stemmed_query = porter.stem(query_term)
    | 147 | +                matching_documents = self.db.get('T_' + stemmed_query)
    | 148 | +                if matching_documents:
    | 149 | +                    matching_document_groups.append(marshal.loads(matching_documents))
    | 150 | +            # map the query operator onto the corresponding set merge
    | 151 | +            if operator == "AND": join_type = "intersection"
    | 152 | +            elif operator == "OR": join_type = "union"
    | 153 | +            documents = self.mergeMatches(matching_document_groups, join_type)
    | 154 | +        else:
    | 155 | +            query_term = query_terms[0]
    | 156 | +            stemmed_query = porter.stem(query_term)
    | 157 | +            matching_documents = self.db.get('T_' + stemmed_query)
    | 158 | +            documents = marshal.loads(matching_documents)
110 | 159 |         ret = {}
111 | 160 |         ret['query'] = query
112 | 161 |         if matching_documents:
113 |     | -            documents = marshal.loads(matching_documents)
114 | 162 |             results = self.sort_by(documents)
115 | 163 |             ret['count'] = len(results)
116 | 164 |             ranked_documents = []
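A rough illustration of what the new merge behaviour amounts to (not part of the commit; the terms and document ids below are invented): assuming each per-term match group maps a document id to a (positions, document_length) tuple, as the rest of search expects, an AND query keeps only the ids common to every group while an OR query keeps the ids found in any group.

from sets import Set  # same Python 2 sets module the commit imports

# two hypothetical per-term match groups: document id -> (positions, doc length)
spam_matches = {'doc1': ([3, 17], 120), 'doc2': ([5], 80)}
eggs_matches = {'doc1': ([9], 120), 'doc3': ([2, 4], 95)}

# AND behaviour: intersect the document ids across groups
print Set(spam_matches.keys()).intersection(Set(eggs_matches.keys()))
# e.g. Set(['doc1'])

# OR behaviour: union of the document ids
print Set(spam_matches.keys()).union(Set(eggs_matches.keys()))
# e.g. Set(['doc1', 'doc2', 'doc3']) (element order may vary)

# mergeMatches([spam_matches, eggs_matches], "intersection") would therefore be
# expected to return {'doc1': ([3, 17, 9], 120)}, with the positions from both
# groups concatenated, and search('spam eggs', operator="OR") would pass the
# union through the existing sorting and ranking path.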