Skip to content

Commit 0bcf3e5

Browse files
committed
Added SparseDoubleVector (with DIMSUM downsampling)
1 parent 0d2e266 commit 0bcf3e5

File tree

4 files changed

+463
-0
lines changed

4 files changed

+463
-0
lines changed

src/main/java/info/debatty/java/stringsimilarity/KShingling.java

+11
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,15 @@ public StringSet getSet(String s) {
152152
// Convert hashmap to sparsearray
153153
return new StringSet(new SparseBooleanVector(hash_profile), this);
154154
}
155+
156+
/**
157+
* Return the number of different n-grams (k-shingles) found by this
158+
* k-shingling instance.
159+
* @return the current size of this instance's {@code shingles} collection
160+
*/
161+
public int getDimension() {
162+
return this.shingles.size();
163+
}
155164

156165
private HashMap<Integer, Integer> getHashProfile(String s) {
157166
HashMap<Integer, Integer> hash_profile = new HashMap<Integer, Integer>(s.length());
@@ -183,4 +192,6 @@ private HashMap<Integer, Integer> getHashProfile(String s) {
183192
}
184193

185194

195+
196+
186197
}

src/main/java/info/debatty/java/stringsimilarity/StringProfile.java

+4
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,8 @@ public double qgramDistance(StringProfile other) throws Exception {
6767

6868
return this.vector.qgram(other.vector);
6969
}
70+
71+
/**
 * Return the sparse integer vector held by this profile.
 * The internal instance itself is returned, not a copy.
 *
 * @return the {@code vector} field of this profile
 */
public SparseIntegerVector getSparseVector() {
72+
return this.vector;
73+
}
7074
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 tibo.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity.examples;
26+
27+
import info.debatty.java.utils.SparseDoubleVector;
28+
import java.util.Random;
29+
30+
/**
31+
*
32+
* @author tibo
33+
*/
34+
public class SparseDoubleVectorExample {
35+
36+
/**
37+
* @param args the command line arguments
38+
*/
39+
public static void main(String[] args) {
40+
41+
int count = 400;
42+
int size = 1000;
43+
double threshold = 0.7;
44+
45+
System.out.println("Create some random SparseDoubleVector...");
46+
Random r = new Random();
47+
SparseDoubleVector[] data = new SparseDoubleVector[count];
48+
for (int i = 0; i < count; i++) {
49+
double[] v = new double[size];
50+
for (int j = 0; j < size; j++) {
51+
v[j] = r.nextDouble();
52+
}
53+
data[i] = new SparseDoubleVector(v);
54+
}
55+
56+
57+
System.out.println("Compute real similarities...");
58+
double[][] real_similarities = new double[count][count];
59+
for (int i = 0; i < count; i++) {
60+
for (int j = 0; j < i; j++) {
61+
real_similarities[i][j] = data[i].cosineSimilarity(data[j]);
62+
}
63+
}
64+
65+
66+
System.out.println("Downsample the vectors using DIMSUM algorithm...");
67+
for (int i = 0; i < count; i++) {
68+
try {
69+
data[i].sampleDIMSUM(threshold, count, size);
70+
} catch(Exception ex) {
71+
System.out.println(ex.getMessage());
72+
}
73+
}
74+
75+
System.out.println("Compute estimated similarities...");
76+
int above_threshold = 0;
77+
int correct = 0;
78+
for (int i = 0; i < count; i++) {
79+
for (int j = 0; j < i; j++) {
80+
81+
double sim = data[i].cosineSimilarity(data[j]);
82+
83+
if (real_similarities[i][j] >= threshold) {
84+
above_threshold++;
85+
86+
if (Math.abs(real_similarities[i][j] - sim) / real_similarities[i][j] < 0.2) {
87+
correct++;
88+
}
89+
}
90+
}
91+
}
92+
System.out.println("Above threshold: " + above_threshold);
93+
System.out.println("Correct (max relative error 20%)" + correct);
94+
System.out.println("(" + Math.round(100.0 * correct / above_threshold) + "%)");
95+
96+
}
97+
}

0 commit comments

Comments
 (0)