Skip to content

Instantly share code, notes, and snippets.

@jsuchal
Created July 17, 2013 14:26
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jsuchal/54e26cc6601bf737b94d to your computer and use it in GitHub Desktop.
Save jsuchal/54e26cc6601bf737b94d to your computer and use it in GitHub Desktop.
Infinity = 1.0 / 0
class MinHasher
def initialize(n, universe_size, seed = nil)
srand(seed) unless seed.nil?
@hash_functions = (1..n).collect do
create_random_hash_function(universe_size)
end
end
def generate_signature(set)
@hash_functions.collect do |hash_function|
calculate_minhash(hash_function, set)
end
end
private
def create_random_hash_function(universe_size)
a = rand(universe_size)
b = rand(universe_size)
lambda { |x| (a * x + b).modulo universe_size }
end
def calculate_minhash(hash_function, set)
minhash = Infinity
set.each do |item|
value = hash_function.call(item)
minhash = value if value < minhash
end
minhash
end
end
def jaccard(s1, s2)
(s1 & s2).size.to_f / (s1 | s2).size
end
def similarity(s1, s2)
matches = 0
s1.each_with_index do |hash, idx|
matches += 1 if hash == s2[idx]
end
matches / s1.size.to_f
end
minhasher = MinHasher.new(10, 12549826247007890, 1234567)
u1 = (1..10000).to_a
u2 = (5001..20000).to_a
s1 = minhasher.generate_signature u1
s2 = minhasher.generate_signature u2
puts "J(u1, u2) = #{jaccard(u1, u2)}", "sim(s1, s2) = #{similarity(s1, s2)}", nil
puts s1.inspect, s2.inspect
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment