├── test ├── test_helper.exs ├── rock │ ├── jaccard_coefficient_test.exs │ ├── utils_test.exs │ ├── struct │ │ ├── point_test.exs │ │ ├── cluster_test.exs │ │ └── heap_test.exs │ ├── neighbour_criterion_test.exs │ ├── links_test.exs │ ├── neighbours_test.exs │ ├── heaps_test.exs │ ├── algorithm_test.exs │ └── cluster_merge_criterion_test.exs ├── support │ └── test_factory.ex ├── rock_test.exs └── integration │ └── matrix_transformation_test.exs ├── CHANGELOG.md ├── .formatter.exs ├── lib ├── rock │ ├── neighbour_criterion.ex │ ├── jaccard_coefficient.ex │ ├── utils.ex │ ├── struct │ │ ├── cluster.ex │ │ ├── point.ex │ │ └── heap.ex │ ├── links.ex │ ├── neighbours.ex │ ├── algorithm.ex │ ├── cluster_merge_criterion.ex │ └── heaps.ex └── rock.ex ├── .gitignore ├── mix.exs ├── LICENSE ├── config └── config.exs ├── mix.lock └── README.md /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.1.2 2 | * Update deps (https://github.com/ayrat555/rock/pull/4) 3 | -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | [ 2 | inputs: [ 3 | "{lib,config,test}/**/*.{ex,exs}" 4 | ] 5 | ] 6 | -------------------------------------------------------------------------------- /lib/rock/neighbour_criterion.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.NeighbourCriterion do 2 | alias Rock.Struct.Point 3 | alias Rock.JaccardCoefficient 4 | @moduledoc false 5 | 6 | def new( 7 | theta, 8 | similarity_function \\ &JaccardCoefficient.measure/2 9 | ) do 10 | fn %Point{} = point1, %Point{} = point2 -> 11 | if similarity_function.(point1, point2) >= theta, do: 
1, else: 0 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /test/rock/jaccard_coefficient_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.JaccardCoefficientTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Point 4 | alias Rock.JaccardCoefficient 5 | 6 | test "calculates jaccard coefficient" do 7 | point1 = Point.new(["1", "2", "3", "5", "6"]) 8 | point2 = Point.new(["1", "2", "7", "8"]) 9 | 10 | coefficient = JaccardCoefficient.measure(point1, point2) 11 | 12 | ^coefficient = 2 / 7 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/rock/jaccard_coefficient.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.JaccardCoefficient do 2 | alias Rock.Struct.Point 3 | @moduledoc false 4 | 5 | # Jaccard coefficient of two points: size of the intersection of their 6 | # attribute sets divided by the size of the union. 7 | def measure(%Point{} = point1, %Point{} = point2) do 8 | case union_count(point1, point2) do 9 | # both points have no attributes: treat them as identical instead of raising ArithmeticError on 0 / 0 10 | 0 -> 1.0 11 | union -> intersection_count(point1, point2) / union 12 | end 13 | end 14 | 15 | defp union_count(point1, point2) do 16 | point1 17 | |> Point.union(point2) 18 | |> Enum.count() 19 | end 20 | 21 | defp intersection_count(point1, point2) do 22 | point1 23 | |> Point.intersection(point2) 24 | |> Enum.count() 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/rock/utils.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Utils do 2 | alias Rock.Struct.Point 3 | alias Rock.Struct.Cluster 4 | @moduledoc false 5 | 6 | def internalize_points(points) when is_list(points) do 7 | points 8 | |> Enum.with_index() 9 | |> Enum.map(fn {{name, attributes}, index} -> 10 | Point.new(name, attributes, index) 11 | end) 12 | end 13 | 14 | def externalize_clusters(clusters) when is_list(clusters) do 15 | clusters 16 | |> Enum.map(fn %Cluster{points: points} -> 17 | points |> Enum.map(&Point.to_list/1) 18 | 
end) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | -------------------------------------------------------------------------------- /test/support/test_factory.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Test.TestFactory do 2 | @moduledoc false 3 | 4 | alias Rock.Struct.Point 5 | alias Rock.Struct.Cluster 6 | alias Rock.Struct.Heap 7 | 8 | def from_string(:cluster, string_points) do 9 | string_points 10 | |> Enum.map(fn string_point -> 11 | from_string(:point, string_point) 12 | end) 13 | |> Cluster.new() 14 | end 15 | 16 | def from_string(:point, string_attributes) do 17 | string_attributes |> Point.new() 18 | end 19 | 20 | def create(:heap, items) do 21 | %Heap{cluster: %Cluster{uuid: UUID.uuid4()}, items: items} 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /test/rock/utils_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.UtilsTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Utils 5 | alias Rock.Struct.Point 6 | 7 | test "internalizes points" do 8 | external_input = [ 9 | {"point1", ["1", "2", "3", "4", "5"]}, 10 | {"point2", ["1", "6", "7"]}, 11 | {"point3", 
["5", "8", "8"]} 12 | ] 13 | 14 | points = external_input |> Utils.internalize_points() 15 | 16 | points 17 | |> Enum.reduce(0, fn %Point{attributes: attributes, index: index, name: name}, count -> 18 | assert external_input 19 | |> Enum.any?(fn {n, attrs} -> 20 | name == n && MapSet.new(attrs) == attributes 21 | end) 22 | 23 | ^index = count 24 | 25 | count + 1 26 | end) 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /test/rock/struct/point_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.Struct.PointTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Point 4 | 5 | test "calculates intersection of two points" do 6 | point1 = Point.new(["1", "2", "5", "6", "9", "10"]) 7 | point2 = Point.new(["3", "4", "5", "6", "7", "8"]) 8 | 9 | intersection = Point.intersection(point1, point2) 10 | 11 | ^intersection = ["5", "6"] 12 | end 13 | 14 | test "calculates union of two points" do 15 | point1 = Point.new(["1", "2"]) 16 | point2 = Point.new(["3", "4", "5"]) 17 | 18 | intersection = Point.union(point1, point2) 19 | 20 | ^intersection = ["1", "2", "3", "4", "5"] 21 | end 22 | 23 | test "calculates attribute size of a point" do 24 | point = Point.new(["1", "2", "3", "4"]) 25 | 26 | attribute_size = point |> Point.attribute_size() 27 | 28 | ^attribute_size = 4 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/rock/struct/cluster.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Struct.Cluster do 2 | defstruct points: [], size: 0, uuid: nil 3 | 4 | alias Rock.Struct.Cluster 5 | alias Rock.Struct.Point 6 | @moduledoc false 7 | 8 | def new(points) when is_list(points) do 9 | size = points |> Enum.count() 10 | 11 | %Cluster{points: points, size: size, uuid: UUID.uuid4()} 12 | end 13 | 14 | def add_point(%Cluster{points: points, size: size} = cluster, %Point{} = 
point) do 15 | new_points = points ++ [point] 16 | # struct update syntax keeps the cluster's uuid; a bare %Cluster{} literal would reset it to nil 17 | %Cluster{cluster | points: new_points, size: size + 1} 18 | end 19 | 20 | def merge( 21 | %Cluster{points: points1, size: size1}, 22 | %Cluster{points: points2, size: size2} 23 | ) do 24 | new_points = points1 ++ points2 25 | new_size = size1 + size2 26 | 27 | %Cluster{ 28 | points: new_points, 29 | size: new_size, 30 | uuid: UUID.uuid4() 31 | } 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /test/rock/neighbour_criterion_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.NeighbourCriterionTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Point 4 | alias Rock.NeighbourCriterion 5 | 6 | test "check if points are neighbours with Jaccard Coefficient" do 7 | point1 = Point.new(["1", "2", "5"]) 8 | point2 = Point.new(["1", "5", "6"]) 9 | 10 | criterion = NeighbourCriterion.new(0.1) 11 | assert criterion.(point1, point2) == 1 12 | end 13 | 14 | test "check if points are neighbours with custom similarity function" do 15 | similarity_function = fn %Point{attributes: attributes1}, %Point{attributes: attributes2} -> 16 | Enum.count(attributes1) * Enum.count(attributes2) 17 | end 18 | 19 | point1 = Point.new(["1", "2", "5"]) 20 | point2 = Point.new(["1", "5", "6"]) 21 | 22 | criterion = NeighbourCriterion.new(100, similarity_function) 23 | assert criterion.(point1, point2) == 0 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.Mixfile do 2 | @moduledoc false 3 | 4 | use Mix.Project 5 | 6 | def project do 7 | [ 8 | app: :rock, 9 | version: "0.1.2", 10 | elixir: "~> 1.4", 11 | description: "ROCK: A Robust Clustering Algorithm for Categorical Attributes", 12 | package: [ 13 | maintainers: ["Ayrat Badykov"], 14 | licenses: ["MIT"], 15 | links: %{"GitHub" => 
"https://github.com/ayrat555/rock"} 16 | ], 17 | build_embedded: Mix.env() == :prod, 18 | start_permanent: Mix.env() == :prod, 19 | deps: deps(), 20 | elixirc_paths: elixirc_paths(Mix.env()) 21 | ] 22 | end 23 | 24 | def application do 25 | [extra_applications: [:logger]] 26 | end 27 | 28 | defp deps do 29 | [ 30 | {:credo, "~> 1.4", only: [:dev, :test], runtime: false}, 31 | {:elixir_uuid, "~> 1.2"}, 32 | {:apex, "~> 1.2", only: [:dev, :test]}, 33 | {:ex_doc, "~> 0.22", only: :dev, runtime: false} 34 | ] 35 | end 36 | 37 | defp elixirc_paths(:test), do: ["lib", "test/support"] 38 | defp elixirc_paths(_), do: ["lib"] 39 | end 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ayrat Badykov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for 9 | # 3rd-party users, it should be done in your "mix.exs" file. 10 | 11 | # You can configure for your application as: 12 | # 13 | # config :rock, key: :value 14 | # 15 | # And access this configuration in your application as: 16 | # 17 | # Application.get_env(:rock, :key) 18 | # 19 | # Or configure a 3rd-party app: 20 | # 21 | # config :logger, level: :info 22 | # 23 | 24 | # It is also possible to import configuration files, relative to this 25 | # directory. For example, you can emulate configuration per environment 26 | # by uncommenting the line below and defining dev.exs, test.exs and such. 27 | # Configuration from the imported file will override the ones defined 28 | # here (which is why it is important to import them last). 
29 | # 30 | # import_config "#{Mix.env}.exs" 31 | -------------------------------------------------------------------------------- /lib/rock/struct/point.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Struct.Point do 2 | defstruct attributes: [], name: nil, index: nil 3 | 4 | alias Rock.Struct.Point 5 | @moduledoc false 6 | 7 | def new(name, attributes, index) when is_list(attributes) do 8 | attributes = MapSet.new(attributes) 9 | 10 | %Point{attributes: attributes, name: name, index: index} 11 | end 12 | 13 | def new(attributes) when is_list(attributes) do 14 | attributes = MapSet.new(attributes) 15 | 16 | %Point{attributes: attributes} 17 | end 18 | 19 | def intersection( 20 | %Point{attributes: attributes1}, 21 | %Point{attributes: attributes2} 22 | ) do 23 | attributes1 24 | |> MapSet.intersection(attributes2) 25 | |> MapSet.to_list() 26 | end 27 | 28 | def union( 29 | %Point{attributes: attributes1}, 30 | %Point{attributes: attributes2} 31 | ) do 32 | attributes1 33 | |> MapSet.union(attributes2) 34 | |> MapSet.to_list() 35 | end 36 | 37 | def attribute_size(%Point{attributes: attributes}) do 38 | attributes |> Enum.count() 39 | end 40 | 41 | def to_list(%Point{attributes: attributes, name: name}) do 42 | attr_list = attributes |> MapSet.to_list() 43 | 44 | {name, attr_list} 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /test/rock/struct/cluster_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.Struct.ClusterTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Cluster 4 | alias Rock.Test.TestFactory 5 | 6 | test "adds a point to a cluster" do 7 | point = TestFactory.from_string(:point, ["6"]) 8 | 9 | cluster = 10 | TestFactory.from_string( 11 | :cluster, 12 | [ 13 | ["1", "2", "3"], 14 | ["5"] 15 | ] 16 | ) 17 | 18 | %Cluster{points: points, size: size} = 19 | cluster 20 | |> 
Cluster.add_point(point) 21 | 22 | ^size = 3 23 | 24 | assert Enum.any?(points, fn p -> 25 | p == point 26 | end) 27 | end 28 | 29 | test "merges two clusters" do 30 | cluster1 = 31 | %Cluster{uuid: uuid1, points: points1} = 32 | TestFactory.from_string( 33 | :cluster, 34 | [ 35 | ["1", "2", "3"], 36 | ["5", "7"] 37 | ] 38 | ) 39 | 40 | cluster2 = 41 | %Cluster{uuid: uuid2, points: points2} = 42 | TestFactory.from_string( 43 | :cluster, 44 | [ 45 | ["1", "2"], 46 | ["5"] 47 | ] 48 | ) 49 | 50 | %Cluster{uuid: uuid3, points: new_points} = Cluster.merge(cluster1, cluster2) 51 | 52 | assert uuid3 != uuid1 53 | assert uuid3 != uuid2 54 | assert new_points == points1 ++ points2 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/rock/links.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Links do 2 | alias Rock.Neighbours 3 | @moduledoc false 4 | 5 | def matrix(points, neighbour_criterion) 6 | when is_list(points) do 7 | neighbour_lists = 8 | points 9 | |> Neighbours.list(neighbour_criterion) 10 | 11 | points 12 | |> Enum.count() 13 | |> initialize_links 14 | |> link_matrix(neighbour_lists) 15 | end 16 | 17 | defp initialize_links(size) do 18 | # List.duplicate/2 yields [] for size 0, whereas the range 1..0 counts down (1, 0) and builds two rows 19 | 0 |> List.duplicate(size) |> List.duplicate(size) 20 | end 21 | 22 | # no (more) neighbour lists to process, e.g. for an empty list of points 23 | defp link_matrix(link_matrix, []) do 24 | link_matrix 25 | end 26 | 27 | defp link_matrix(link_matrix, [neighbour_list | tail]) do 28 | link_matrix 29 | |> links_from_neighbours(neighbour_list) 30 | |> link_matrix(tail) 31 | end 32 | 33 | # a point without any neighbour contributes no links 34 | defp links_from_neighbours(link_matrix, []) do 35 | link_matrix 36 | end 37 | 38 | defp links_from_neighbours(link_matrix, [_neighbour | []]) do 39 | link_matrix 40 | end 41 | 42 | defp links_from_neighbours(link_matrix, [neighbour | neighbour_tail]) do 43 | link_matrix 44 | |> add_links(neighbour_tail, neighbour) 45 | |> links_from_neighbours(neighbour_tail) 46 | end 47 | 48 | defp 
add_links(link_matrix, [neighbour | []], row_index) do 49 | link_matrix 50 | |> add_link(row_index, neighbour) 51 | end 52 | 53 | defp add_links(link_matrix, [neighbour | neighbour_tail], row_index) do 54 | link_matrix 55 | |> add_link(row_index, neighbour) 56 | |> add_links(neighbour_tail, row_index) 57 | end 58 | 59 | defp add_link(link_matrix, row_index, column_index) do 60 | link_matrix 61 | |> List.update_at(row_index, fn row -> 62 | row 63 | |> List.update_at(column_index, &(&1 + 1)) 64 | end) 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /test/rock_test.exs: -------------------------------------------------------------------------------- 1 | defmodule RockTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Struct.Point 5 | 6 | @points [ 7 | {"point1", ["1", "2", "3"]}, 8 | {"point2", ["1", "2", "4"]}, 9 | {"point3", ["1", "2", "5"]}, 10 | {"point4", ["1", "3", "4"]}, 11 | {"point5", ["1", "3", "5"]}, 12 | {"point6", ["1", "4", "5"]}, 13 | {"point7", ["2", "3", "4"]}, 14 | {"point8", ["2", "3", "5"]}, 15 | {"point9", ["2", "4", "5"]}, 16 | {"point10", ["3", "4", "5"]}, 17 | {"point11", ["1", "2", "6"]}, 18 | {"point12", ["1", "2", "7"]}, 19 | {"point13", ["1", "6", "7"]}, 20 | {"point14", ["2", "6", "7"]} 21 | ] 22 | 23 | test "clusterizes points" do 24 | theta = 0.15 25 | number_of_clusters = 2 26 | 27 | result = @points |> Rock.clusterize(number_of_clusters, theta) 28 | 29 | [ 30 | [ 31 | {"point11", ["1", "2", "6"]}, 32 | {"point12", ["1", "2", "7"]}, 33 | {"point5", ["1", "3", "5"]}, 34 | {"point6", ["1", "4", "5"]}, 35 | {"point3", ["1", "2", "5"]}, 36 | {"point4", ["1", "3", "4"]}, 37 | {"point1", ["1", "2", "3"]}, 38 | {"point2", ["1", "2", "4"]}, 39 | {"point7", ["2", "3", "4"]}, 40 | {"point8", ["2", "3", "5"]}, 41 | {"point9", ["2", "4", "5"]}, 42 | {"point10", ["3", "4", "5"]} 43 | ], 44 | [ 45 | {"point13", ["1", "6", "7"]}, 46 | {"point14", ["2", "6", "7"]} 47 | ] 48 | ] = result 49 | end 50 | 
51 | test "clusterizes points with custom similarity function" do 52 | theta = 0.5 53 | number_of_clusters = 2 54 | 55 | similarity_function = fn %Point{attributes: attributes1}, %Point{attributes: attributes2} -> 56 | count1 = Enum.count(attributes1) 57 | count2 = Enum.count(attributes2) 58 | 59 | if count1 >= count2, do: count2 / count1, else: count1 / count2 60 | end 61 | 62 | result = @points |> Rock.clusterize(number_of_clusters, theta, similarity_function) 63 | 64 | ^number_of_clusters = result |> Enum.count() 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /lib/rock/neighbours.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Neighbours do 2 | @moduledoc false 3 | 4 | def list(points, neighbour_criterion) when is_list(points) do 5 | points 6 | |> matrix(neighbour_criterion) 7 | |> index_list 8 | end 9 | 10 | def matrix(points, neighbour_criterion) when is_list(points) do 11 | points 12 | |> lower_triangle_matrix(neighbour_criterion) 13 | |> copy_to_upper_triangle 14 | end 15 | 16 | defp index_list(matrix) do 17 | matrix 18 | |> Enum.map(fn row -> 19 | row 20 | |> Enum.with_index() 21 | |> Enum.filter(fn {el, _index} -> 22 | el != 0 23 | end) 24 | |> Enum.map(fn {_el, index} -> 25 | index 26 | end) 27 | end) 28 | end 29 | 30 | defp lower_triangle_matrix(points, neighbour_criterion) do 31 | points 32 | |> Enum.with_index() 33 | |> Enum.map(fn {point1, row_index} -> 34 | lower_triangle_row(point1, row_index, points, neighbour_criterion) 35 | end) 36 | end 37 | 38 | defp lower_triangle_row(point1, row_index, points, similarity_function) do 39 | points 40 | |> Enum.with_index() 41 | |> Enum.map(fn {point2, column_index} -> 42 | if row_index >= column_index do 43 | similarity_function.(point1, point2) 44 | else 45 | 0 46 | end 47 | end) 48 | end 49 | 50 | defp copy_to_upper_triangle(lower_neighbor_matrix) do 51 | lower_neighbor_matrix 52 | |> 
Enum.with_index() 53 | |> Enum.map(fn {row, row_index} -> 54 | row 55 | |> Enum.with_index() 56 | |> Enum.map(fn {element, column_index} -> 57 | lower_neighbor_matrix 58 | |> lower_triangle_element(row_index, column_index, element) 59 | end) 60 | end) 61 | end 62 | 63 | defp lower_triangle_element(matrix, row_index, column_index, _element) 64 | when row_index < column_index do 65 | matrix 66 | |> Enum.at(column_index) 67 | |> Enum.at(row_index) 68 | end 69 | 70 | defp lower_triangle_element(_matrix, _row_index, _column_index, element) do 71 | element 72 | end 73 | end 74 | -------------------------------------------------------------------------------- /test/integration/matrix_transformation_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.Integration.MatrixTransformationTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Point 4 | alias Rock.JaccardCoefficient 5 | 6 | test "creates similarity matrix with jaccard coefficient measure" do 7 | points = [ 8 | Point.new(["1", "2", "3"]), 9 | Point.new(["1", "2", "4"]), 10 | Point.new(["1", "2", "5"]), 11 | Point.new(["1", "3", "4"]), 12 | Point.new(["1", "3", "5"]), 13 | Point.new(["1", "4", "5"]), 14 | Point.new(["2", "3", "4"]), 15 | Point.new(["2", "3", "5"]), 16 | Point.new(["2", "4", "5"]), 17 | Point.new(["3", "4", "5"]), 18 | Point.new(["1", "2", "6"]), 19 | Point.new(["1", "2", "7"]), 20 | Point.new(["1", "6", "7"]), 21 | Point.new(["2", "6", "7"]) 22 | ] 23 | 24 | matrix = 25 | points 26 | |> Enum.map(fn point1 -> 27 | points 28 | |> Enum.map(fn point2 -> 29 | JaccardCoefficient.measure(point1, point2) 30 | end) 31 | end) 32 | 33 | [ 34 | [1.0, 0.5, 0.5, 0.5, 0.5, 0.2, 0.5, 0.5, 0.2, 0.2, 0.5, 0.5, 0.2, 0.2], 35 | [0.5, 1.0, 0.5, 0.5, 0.2, 0.5, 0.5, 0.2, 0.5, 0.2, 0.5, 0.5, 0.2, 0.2], 36 | [0.5, 0.5, 1.0, 0.2, 0.5, 0.5, 0.2, 0.5, 0.5, 0.2, 0.5, 0.5, 0.2, 0.2], 37 | [0.5, 0.5, 0.2, 1.0, 0.5, 0.5, 0.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.2, 0.0], 38 | [0.5, 0.2, 
0.5, 0.5, 1.0, 0.5, 0.2, 0.5, 0.2, 0.5, 0.2, 0.2, 0.2, 0.0], 39 | [0.2, 0.5, 0.5, 0.5, 0.5, 1.0, 0.2, 0.2, 0.5, 0.5, 0.2, 0.2, 0.2, 0.0], 40 | [0.5, 0.5, 0.2, 0.5, 0.2, 0.2, 1.0, 0.5, 0.5, 0.5, 0.2, 0.2, 0.0, 0.2], 41 | [0.5, 0.2, 0.5, 0.2, 0.5, 0.2, 0.5, 1.0, 0.5, 0.5, 0.2, 0.2, 0.0, 0.2], 42 | [0.2, 0.5, 0.5, 0.2, 0.2, 0.5, 0.5, 0.5, 1.0, 0.5, 0.2, 0.2, 0.0, 0.2], 43 | [0.2, 0.2, 0.2, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.0], 44 | [0.5, 0.5, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.0, 1.0, 0.5, 0.5, 0.5], 45 | [0.5, 0.5, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.0, 0.5, 1.0, 0.5, 0.5], 46 | [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 1.0, 0.5], 47 | [0.2, 0.2, 0.2, 0.0, 0.0, 0.0, 0.2, 0.2, 0.2, 0.0, 0.5, 0.5, 0.5, 1.0] 48 | ] = matrix 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/rock/algorithm.ex: -------------------------------------------------------------------------------- 1 | defmodule Rock.Algorithm do 2 | alias Rock.Struct.Cluster 3 | alias Rock.NeighbourCriterion 4 | alias Rock.Links 5 | alias Rock.Heaps 6 | @moduledoc false 7 | 8 | def clusterize(points, number_of_clusters, theta, similarity_function \\ nil) 9 | when is_list(points) do 10 | neighbour_criterion = 11 | if is_nil(similarity_function) do 12 | theta |> NeighbourCriterion.new() 13 | else 14 | theta |> NeighbourCriterion.new(similarity_function) 15 | end 16 | 17 | link_matrix = points |> Links.matrix(neighbour_criterion) 18 | initial_clusters = points |> initialize_clusters 19 | current_number_of_clusters = points |> Enum.count() 20 | local_heaps = initial_clusters |> Heaps.initialize(link_matrix, theta) 21 | 22 | local_heaps 23 | |> optimize_clusters( 24 | initial_clusters, 25 | theta, 26 | number_of_clusters, 27 | current_number_of_clusters 28 | ) 29 | end 30 | 31 | defp initialize_clusters(points) do 32 | points 33 | |> Enum.map(fn point -> 34 | point 35 | |> List.wrap() 36 | |> Cluster.new() 37 | end) 38 | 
end 39 | 40 | defp optimize_clusters(_, _, _, necessary_number, current_number) 41 | when necessary_number > current_number do 42 | raise ArgumentError, 43 | message: "Needed number of clusters must be smaller than the number of points" 44 | end 45 | 46 | defp optimize_clusters(_, clusters, _, necessary_number, current_number) 47 | when necessary_number == current_number do 48 | clusters 49 | end 50 | 51 | defp optimize_clusters(local_heaps, clusters, theta, necessary_number, current_number) do 52 | global_heap = local_heaps |> Heaps.global_heap() 53 | {_, _, v_uuid, u_uuid} = global_heap |> Enum.at(0) 54 | v_cluster = clusters |> find_cluster(v_uuid) 55 | u_cluster = clusters |> find_cluster(u_uuid) 56 | 57 | {new_local_heap, new_cluster} = 58 | local_heaps 59 | |> Heaps.update(v_cluster, u_cluster, theta) 60 | 61 | new_clusters = 62 | clusters 63 | |> List.delete(v_cluster) 64 | |> List.delete(u_cluster) 65 | 66 | new_clusters = [new_cluster | new_clusters] 67 | 68 | optimize_clusters( 69 | new_local_heap, 70 | new_clusters, 71 | theta, 72 | necessary_number, 73 | current_number - 1 74 | ) 75 | end 76 | 77 | defp find_cluster(clusters, uuid) do 78 | clusters 79 | |> Enum.find(fn %Cluster{uuid: cluster_uuid} -> 80 | uuid == cluster_uuid 81 | end) 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /test/rock/links_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.LinksTest do 2 | use ExUnit.Case 3 | alias Rock.Links 4 | alias Rock.Struct.Point 5 | alias Rock.NeighbourCriterion 6 | 7 | test "calculates link matrix (example 1)" do 8 | criterion = NeighbourCriterion.new(0.1) 9 | 10 | points = [ 11 | Point.new(["1", "2", "3", "4", "5"]), 12 | Point.new(["1"]), 13 | Point.new(["5", "6", "7"]) 14 | ] 15 | 16 | link_matrix = 17 | points 18 | |> Links.matrix(criterion) 19 | 20 | ^link_matrix = [ 21 | [0, 2, 2], 22 | [0, 0, 1], 23 | [0, 0, 0] 24 | ] 25 | end 26 | 
27 | test "calculates link matrix (example 2)" do 28 | criterion = NeighbourCriterion.new(0.5) 29 | 30 | points = [ 31 | Point.new(["1", "2", "3"]), 32 | Point.new(["1", "2", "4"]), 33 | Point.new(["1", "2", "5"]), 34 | Point.new(["1", "3", "4"]), 35 | Point.new(["1", "3", "5"]), 36 | Point.new(["1", "4", "5"]), 37 | Point.new(["2", "3", "4"]), 38 | Point.new(["2", "3", "5"]), 39 | Point.new(["2", "4", "5"]), 40 | Point.new(["3", "4", "5"]), 41 | Point.new(["1", "2", "6"]), 42 | Point.new(["1", "2", "7"]), 43 | Point.new(["1", "6", "7"]), 44 | Point.new(["2", "6", "7"]) 45 | ] 46 | 47 | link_matrix = 48 | points 49 | |> Links.matrix(criterion) 50 | 51 | ^link_matrix = [ 52 | [0, 7, 7, 5, 5, 4, 5, 5, 4, 4, 5, 5, 2, 2], 53 | [0, 0, 7, 5, 4, 5, 5, 4, 5, 4, 5, 5, 2, 2], 54 | [0, 0, 0, 4, 5, 5, 4, 5, 5, 4, 5, 5, 2, 2], 55 | [0, 0, 0, 0, 5, 5, 5, 4, 4, 5, 2, 2, 0, 0], 56 | [0, 0, 0, 0, 0, 5, 4, 5, 4, 5, 2, 2, 0, 0], 57 | [0, 0, 0, 0, 0, 0, 4, 4, 5, 5, 2, 2, 0, 0], 58 | [0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 2, 2, 0, 0], 59 | [0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 2, 2, 0, 0], 60 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2, 0, 0], 61 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 62 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 4], 63 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4], 64 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4], 65 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 66 | ] 67 | end 68 | 69 | test "calculates link matrix (example 3)" do 70 | criterion = NeighbourCriterion.new(0.2) 71 | 72 | points = [ 73 | Point.new(["1"]), 74 | Point.new(["2", "3", "4", "5"]), 75 | Point.new(["2", "3", "4", "6"]), 76 | Point.new(["4", "6"]) 77 | ] 78 | 79 | link_matrix = 80 | points 81 | |> Links.matrix(criterion) 82 | 83 | ^link_matrix = [ 84 | [0, 0, 0, 0], 85 | [0, 0, 3, 3], 86 | [0, 0, 0, 3], 87 | [0, 0, 0, 0] 88 | ] 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/rock/cluster_merge_criterion.ex: 
-------------------------------------------------------------------------------- 1 | defmodule Rock.ClusterMergeCriterion do 2 | alias Rock.Struct.Point 3 | alias Rock.Struct.Cluster 4 | @moduledoc false 5 | 6 | def measure(%Cluster{size: size1}, %Cluster{size: size2}, theta, cross_link_count) do 7 | power = 1 + 2 * f_theta(theta) 8 | summand1 = :math.pow(size1 + size2, power) 9 | summand2 = :math.pow(size1, power) 10 | summand3 = :math.pow(size2, power) 11 | 12 | measure = cross_link_count / (summand1 - summand2 - summand3) 13 | 14 | measure 15 | end 16 | 17 | def measure( 18 | link_matrix, 19 | %Cluster{} = cluster1, 20 | %Cluster{} = cluster2, 21 | theta 22 | ) do 23 | cross_link_count = count_cross_links(link_matrix, cluster1, cluster2) 24 | measure = measure(cluster1, cluster2, theta, cross_link_count) 25 | 26 | {measure, cross_link_count} 27 | end 28 | 29 | def count_cross_links( 30 | link_matrix, 31 | %Cluster{points: points1}, 32 | %Cluster{points: points2} 33 | ) do 34 | count_cross_links(link_matrix, points1, points2, 0) 35 | end 36 | 37 | defp count_cross_links( 38 | link_matrix, 39 | [point1 | []], 40 | second_cluster_points, 41 | count 42 | ) do 43 | count_cross_links(link_matrix, point1, second_cluster_points, count) 44 | end 45 | 46 | defp count_cross_links( 47 | link_matrix, 48 | [point1 | tail], 49 | second_cluster_points, 50 | count 51 | ) do 52 | # the per-point call already returns `count` plus that point's links, 53 | # so adding `count` once more here would double-count the earlier points 54 | new_count = count_cross_links(link_matrix, point1, second_cluster_points, count) 55 | 56 | count_cross_links(link_matrix, tail, second_cluster_points, new_count) 57 | end 58 | 59 | defp count_cross_links( 60 | link_matrix, 61 | %Point{index: index1}, 62 | [%Point{index: index2} | []], 63 | count 64 | ) do 65 | count + number_of_links(link_matrix, index1, index2) 66 | end 67 | 68 | defp count_cross_links( 69 | link_matrix, 70 | %Point{index: index1} = point1, 71 | [%Point{index: index2} | tail], 72 | count 73 | ) do 74 | new_count = count + number_of_links(link_matrix, index1, index2) 75 
| 76 | count_cross_links(link_matrix, point1, tail, new_count) 77 | end 78 | 79 | defp number_of_links(link_matrix, index1, index2) do 80 | # because our link matrix is symmetric and we have zeros under main diagonal 81 | {index1, index2} = 82 | if index1 > index2, 83 | do: {index2, index1}, 84 | else: {index1, index2} 85 | 86 | link_matrix 87 | |> Enum.at(index1) 88 | |> Enum.at(index2) 89 | end 90 | 91 | defp f_theta(theta) do 92 | (1 - theta) / (1 + theta) 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "apex": {:hex, :apex, "1.2.1", "297f5dac23fa2a32648b890a0838fce2772114010e0b9ec975cae6021cc5a092", [:mix], [], "hexpm", "379e2515fa5da7a5ac91ecceba782169d1a734e7e09e5f473e4e85576728b65f"}, 3 | "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm", "7af5c7e09fe1d40f76c8e4f9dd2be7cebd83909f31fee7cd0e9eadc567da8353"}, 4 | "credo": {:hex, :credo, "1.4.0", "92339d4cbadd1e88b5ee43d427b639b68a11071b6f73854e33638e30a0ea11f5", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1fd3b70dce216574ce3c18bdf510b57e7c4c85c2ec9cad4bff854abaf7e58658"}, 5 | "earmark": {:hex, :earmark, "1.4.9", "837e4c1c5302b3135e9955f2bbf52c6c52e950c383983942b68b03909356c0d9", [:mix], [{:earmark_parser, ">= 1.4.9", [hex: :earmark_parser, repo: "hexpm", optional: false]}], "hexpm", "0d72df7d13a3dc8422882bed5263fdec5a773f56f7baeb02379361cb9e5b0d8e"}, 6 | "earmark_parser": {:hex, :earmark_parser, "1.4.9", "819bda2049e6ee1365424e4ced1ba65806eacf0d2867415f19f3f80047f8037b", [:mix], [], "hexpm", "8bf54fddabf2d7e137a0c22660e71b49d5a0a82d1fb05b5af62f2761cd6485c4"}, 7 | "elixir_uuid": {:hex, :elixir_uuid, "1.2.1", "dce506597acb7e6b0daeaff52ff6a9043f5919a4c3315abb4143f0b00378c097", 
[:mix], [], "hexpm", "f7eba2ea6c3555cea09706492716b0d87397b88946e6380898c2889d68585752"}, 8 | "ex_doc": {:hex, :ex_doc, "0.22.1", "9bb6d51508778193a4ea90fa16eac47f8b67934f33f8271d5e1edec2dc0eee4c", [:mix], [{:earmark, "~> 1.4.0", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "d957de1b75cb9f78d3ee17820733dc4460114d8b1e11f7ee4fd6546e69b1db60"}, 9 | "jason": {:hex, :jason, "1.2.1", "12b22825e22f468c02eb3e4b9985f3d0cb8dc40b9bd704730efa11abd2708c44", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "b659b8571deedf60f79c5a608e15414085fa141344e2716fbd6988a084b5f993"}, 10 | "makeup": {:hex, :makeup, "1.0.3", "e339e2f766d12e7260e6672dd4047405963c5ec99661abdc432e6ec67d29ef95", [:mix], [{:nimble_parsec, "~> 0.5", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "2e9b4996d11832947731f7608fed7ad2f9443011b3b479ae288011265cdd3dad"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "0.14.1", "4f0e96847c63c17841d42c08107405a005a2680eb9c7ccadfd757bd31dabccfb", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f2438b1a80eaec9ede832b5c41cd4f373b38fd7aa33e3b22d9db79e640cbde11"}, 12 | "nimble_parsec": {:hex, :nimble_parsec, "0.6.0", "32111b3bf39137144abd7ba1cce0914533b2d16ef35e8abc5ec8be6122944263", [:mix], [], "hexpm", "27eac315a94909d4dc68bc07a4a83e06c8379237c5ea528a9acff4ca1c873c52"}, 13 | "uuid": {:hex, :uuid, "1.1.7", "007afd58273bc0bc7f849c3bdc763e2f8124e83b957e515368c498b641f7ab69", [:mix], [], "hexpm"}, 14 | } 15 | -------------------------------------------------------------------------------- /lib/rock/heaps.ex: --------------------------------------------------------------------------------
defmodule Rock.Heaps do
  alias Rock.Struct.Heap
  alias Rock.Struct.Cluster
  @moduledoc false

  # Builds one heap per cluster; each heap is created from the cluster itself
  # and the list of all the other clusters.
  def initialize(clusters, link_matrix, theta) do
    Enum.map(clusters, fn cluster ->
      remaining_clusters = List.delete(clusters, cluster)

      Heap.new(cluster, remaining_clusters, link_matrix, theta)
    end)
  end

  # Merges clusters v and u into a new cluster w and rewrites the heap list:
  #
  #   * in every heap the items for v and u are replaced by a single item for w
  #     (only when w has at least one cross link with the heap's cluster),
  #   * the heaps owned by v and u are dropped,
  #   * a heap for w is built and put at the head of the list.
  #
  # Returns `{heaps, w_cluster}`.
  def update(
        heaps,
        %Cluster{uuid: v_uuid} = v_cluster,
        %Cluster{uuid: u_uuid} = u_cluster,
        theta
      ) do
    w_cluster = Cluster.merge(v_cluster, u_cluster)

    new_heaps =
      heaps
      |> Enum.map(&replace_merged_items(&1, v_uuid, u_uuid, w_cluster, theta))
      |> remove_heap(v_uuid)
      |> remove_heap(u_uuid)

    # need optimization, move to heaps update ^
    w_heap = construct_w_heap(new_heaps, w_cluster)

    {[w_heap | new_heaps], w_cluster}
  end

  # Collects the best item of every heap as
  # `{measure, cross_link_count, heap_cluster_uuid, item_cluster_uuid}`,
  # ordered by descending measure.
  #
  # A heap without items (its cluster has no cross links left) has no best item
  # and is skipped; previously such a heap raised MatchError here.
  def global_heap(heaps) do
    heaps
    |> Enum.flat_map(fn
      %Heap{items: []} ->
        []

      %Heap{
        items: [{measure, cross_link_count, cluster_uuid} | _],
        cluster: %Cluster{uuid: uuid}
      } ->
        [{measure, cross_link_count, uuid, cluster_uuid}]
    end)
    |> Enum.sort_by(fn {measure, _, _, _} ->
      -measure
    end)
  end

  # Replaces the v and u items of one heap with a single item for w.
  defp replace_merged_items(heap, v_uuid, u_uuid, w_cluster, theta) do
    # links to w = links to v + links to u (a missing item counts as 0)
    cross_link_count =
      case {Heap.find_item(heap, v_uuid), Heap.find_item(heap, u_uuid)} do
        {nil, nil} ->
          0

        {nil, {_, u_cross_link_count, _}} ->
          u_cross_link_count

        {{_, v_cross_link_count, _}, nil} ->
          v_cross_link_count

        {{_, v_cross_link_count, _}, {_, u_cross_link_count, _}} ->
          v_cross_link_count + u_cross_link_count
      end

    heap =
      heap
      |> Heap.remove_item(v_uuid)
      |> Heap.remove_item(u_uuid)

    if cross_link_count == 0 do
      heap
    else
      # Heap.add_item/4 already returns the items sorted, no extra sort needed
      Heap.add_item(heap, w_cluster, cross_link_count, theta)
    end
  end

  defp remove_heap(heaps, uuid) do
    Enum.reject(heaps, fn %Heap{cluster: %Cluster{uuid: cluster_uuid}} ->
      uuid == cluster_uuid
    end)
  end

  # Builds the heap of w from the w items already stored in the other heaps:
  # the item found in the heap of cluster X becomes the X item of w's heap.
  defp construct_w_heap(heaps, %Cluster{uuid: w_uuid} = w_cluster) do
    items =
      Enum.flat_map(heaps, fn %Heap{cluster: %Cluster{uuid: uuid}} = heap ->
        case Heap.find_item(heap, w_uuid) do
          nil -> []
          {measure, cross_link_count, _} -> [{measure, cross_link_count, uuid}]
        end
      end)

    %Heap{cluster: w_cluster, items: items} |> Heap.sort_items()
  end
end
-------------------------------------------------------------------------------- /test/rock/neighbours_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.NeighboursTest do 2 | use ExUnit.Case 3 | alias Rock.Struct.Point 4 | alias Rock.Neighbours 5 | alias Rock.NeighbourCriterion 6 | 7 | @points [ 8 | Point.new(["1", "2", "3", "4", "5"]), 9 | Point.new(["1"]), 10 | Point.new(["5", "6", "7"]) 11 | ] 12 | 13 | test "calculates neighbor matrix with jaccard coefficient (example 1)" do 14 | criterion = NeighbourCriterion.new(0.1) 15 | 16 | neighbor_matrix = 17 | @points 18 | |> Neighbours.matrix(criterion) 19 | 20 | ^neighbor_matrix = [ 21 | [1, 1, 1], 22 | [1, 1, 0], 23 | [1, 0, 1] 24 | ] 25 | end 26 | 27 | test "calculates neighbor matrix with jaccard coefficient (example 2)" do 28 | criterion = NeighbourCriterion.new(0.5) 29 | 30 | points = [ 31 | Point.new(["1", "2", "3"]), 32 | Point.new(["1", "2", "4"]), 33 | Point.new(["1", "2", "5"]), 34 | Point.new(["1", "3", "4"]), 35 | Point.new(["1", "3", "5"]), 36 | Point.new(["1", "4", "5"]), 37 | Point.new(["2", "3", "4"]), 38 | Point.new(["2", "3", "5"]), 39 | Point.new(["2", "4", "5"]), 40 | Point.new(["3", "4", "5"]), 41 | Point.new(["1", "2", "6"]), 42 | Point.new(["1", "2", "7"]), 43 | Point.new(["1", "6", "7"]), 44 | Point.new(["2", "6", "7"]) 45 | ] 46 | 47 | neighbor_matrix = 48 | points 49 | |> Neighbours.matrix(criterion) 50 | 51 | [ 52 | [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0], 53 | [1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0], 54 | [1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0], 55 | [1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0], 56 | [1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0], 57 | [0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0], 58 | [1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0], 59 | [1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0], 60 | [0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], 61 | [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 62 | [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 63 | [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 64 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 65 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1] 66 | ] = neighbor_matrix 67 | end 68 | 69 | test "calculates neighbor matrix with custom similarity function" do 70 | similarity_function = fn %Point{attributes: attributes1}, %Point{attributes: attributes2} -> 71 | Enum.count(attributes1) * Enum.count(attributes2) 72 | end 73 | 74 | criterion = NeighbourCriterion.new(10, similarity_function) 75 | 76 | neighbor_matrix = 77 | @points 78 | |> Neighbours.matrix(criterion) 79 | 80 | ^neighbor_matrix = [ 81 | [1, 0, 1], 82 | [0, 0, 0], 83 | [1, 0, 0] 84 | ] 85 | end 86 | 87 | test "returns neighbor indices list" do 88 | criterion = NeighbourCriterion.new(0.1) 89 | 90 | neighbor_list = 91 | @points 92 | |> Neighbours.list(criterion) 93 | 94 | ^neighbor_list = [ 95 | [0, 1, 2], 96 | [0, 1], 97 | [0, 2] 98 | ] 99 | end 100 | end 101 | -------------------------------------------------------------------------------- /test/rock/heaps_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.HeapsTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Heaps 5 | alias Rock.Links 6 | alias Rock.NeighbourCriterion 7 | alias Rock.Struct.Point 8 | alias Rock.Struct.Cluster 9 | alias Rock.Struct.Heap 10 | 11 | setup do 12 | theta = 0.5 13 | criterion = NeighbourCriterion.new(theta) 14 | 15 | points = [ 16 | Point.new("1", ["1", "2", "3"], 0), 17 | Point.new("2", ["1", "2", "4"], 1), 18 | Point.new("3", ["1", "2", "5"], 2), 19 | Point.new("4", ["1", "3", 
"4"], 3), 20 | Point.new("5", ["1", "3", "5"], 4), 21 | Point.new("6", ["1", "4", "5"], 5), 22 | Point.new("7", ["2", "3", "4"], 6), 23 | Point.new("8", ["2", "3", "5"], 7), 24 | Point.new("9", ["2", "4", "5"], 8), 25 | Point.new("10", ["3", "4", "5"], 9), 26 | Point.new("11", ["1", "2", "6"], 10), 27 | Point.new("12", ["1", "2", "7"], 11), 28 | Point.new("13", ["1", "6", "7"], 12), 29 | Point.new("14", ["2", "6", "7"], 13) 30 | ] 31 | 32 | link_matrix = 33 | points 34 | |> Links.matrix(criterion) 35 | 36 | point_clusters = 37 | points 38 | |> Enum.chunk_by(fn %Point{attributes: attrs} -> 39 | attrs |> Enum.at(0) == "1" 40 | end) 41 | 42 | clusters = 43 | point_clusters 44 | |> Enum.map(&Cluster.new(&1)) 45 | 46 | { 47 | :ok, 48 | [ 49 | clusters: clusters, 50 | link_matrix: link_matrix, 51 | theta: theta 52 | ] 53 | } 54 | end 55 | 56 | test "initializes heap list", %{clusters: clusters, link_matrix: link_matrix, theta: theta} do 57 | heaps = clusters |> Heaps.initialize(link_matrix, theta) 58 | 59 | clusters 60 | |> Enum.each(fn %Cluster{uuid: uuid} -> 61 | exists = 62 | heaps 63 | |> Enum.map(fn %Heap{cluster: %Cluster{uuid: cluster_uuid}} -> 64 | cluster_uuid == uuid 65 | end) 66 | 67 | assert exists 68 | end) 69 | end 70 | 71 | test "updates heap list", %{clusters: clusters, link_matrix: link_matrix, theta: theta} do 72 | heaps = clusters |> Heaps.initialize(link_matrix, theta) 73 | cluster1 = %Cluster{uuid: uuid1} = clusters |> Enum.at(0) 74 | cluster2 = %Cluster{uuid: uuid2} = clusters |> Enum.at(1) 75 | 76 | {new_heaps, _cluster3} = heaps |> Heaps.update(cluster1, cluster2, theta) 77 | 78 | assert Enum.count(heaps) == Enum.count(new_heaps) + 1 79 | 80 | refute new_heaps 81 | |> Enum.any?(fn %Heap{cluster: %Cluster{uuid: uuid}} -> 82 | uuid == uuid1 || uuid == uuid2 83 | end) 84 | end 85 | 86 | test "creates global heap from heap list", %{ 87 | clusters: clusters, 88 | link_matrix: link_matrix, 89 | theta: theta 90 | } do 91 | heaps = clusters |> 
Heaps.initialize(link_matrix, theta) 92 | 93 | global_heap = heaps |> Heaps.global_heap() 94 | 95 | clusters 96 | |> Enum.each(fn %Cluster{uuid: uuid} -> 97 | assert global_heap 98 | |> Enum.any?(fn {_, _, cluster_uuid, _} -> 99 | uuid == cluster_uuid 100 | end) 101 | end) 102 | end 103 | end 104 | -------------------------------------------------------------------------------- /lib/rock.ex: --------------------------------------------------------------------------------
defmodule Rock do
  alias Rock.Utils
  alias Rock.Algorithm

  @moduledoc """
  ROCK: A Robust Clustering Algorithm for Categorical Attributes
  """

  @doc """
  Clusterizes points using the Rock algorithm with the provided arguments:

    * `points`, points that will be clusterized
    * `number_of_clusters`, the number of desired clusters.
    * `theta`, neighborhood parameter in the range [0,1). Default value is 0.5.
    * `similarity_function`, similarity function to use. Jaccard Coefficient is used by default.

  `points` must be a list, `number_of_clusters` and `theta` must be numbers and
  `similarity_function` must be a function (or be omitted); a call that breaks
  any of these raises `FunctionClauseError`.

  ## Examples

      points =
        [
          {"point1", ["1", "2", "3"]},
          {"point2", ["1", "2", "4"]},
          {"point3", ["1", "2", "5"]},
          {"point4", ["1", "3", "4"]},
          {"point5", ["1", "3", "5"]},
          {"point6", ["1", "4", "5"]},
          {"point7", ["2", "3", "4"]},
          {"point8", ["2", "3", "5"]},
          {"point9", ["2", "4", "5"]},
          {"point10", ["3", "4", "5"]},
          {"point11", ["1", "2", "6"]},
          {"point12", ["1", "2", "7"]},
          {"point13", ["1", "6", "7"]},
          {"point14", ["2", "6", "7"]}
        ]

      # Example 1

      Rock.clusterize(points, 5, 0.4)
      [
        [
          {"point4", ["1", "3", "4"]},
          {"point5", ["1", "3", "5"]},
          {"point6", ["1", "4", "5"]},
          {"point10", ["3", "4", "5"]},
          {"point7", ["2", "3", "4"]},
          {"point8", ["2", "3", "5"]}
        ],
        [
          {"point11", ["1", "2", "6"]},
          {"point12", ["1", "2", "7"]},
          {"point1", ["1", "2", "3"]},
          {"point2", ["1", "2", "4"]},
          {"point3", ["1", "2", "5"]}
        ],
        [
          {"point9", ["2", "4", "5"]}
        ],
        [
          {"point13", ["1", "6", "7"]}
        ],
        [
          {"point14", ["2", "6", "7"]}
        ]
      ]

      # Example 2 (with custom similarity function)

      similarity_function = fn(
        %Rock.Struct.Point{attributes: attributes1},
        %Rock.Struct.Point{attributes: attributes2}) ->

        count1 = Enum.count(attributes1)
        count2 = Enum.count(attributes2)

        if count1 >= count2, do: (count2 - 1) / count1, else: (count1 - 1) / count2
      end

      Rock.clusterize(points, 4, 0.5, similarity_function)
      [
        [
          {"point1", ["1", "2", "3"]},
          {"point2", ["1", "2", "4"]},
          {"point3", ["1", "2", "5"]},
          {"point4", ["1", "3", "4"]},
          {"point5", ["1", "3", "5"]},
          {"point6", ["1", "4", "5"]},
          {"point7", ["2", "3", "4"]},
          {"point8", ["2", "3", "5"]},
          {"point9", ["2", "4", "5"]},
          {"point10", ["3", "4", "5"]},
          {"point11", ["1", "2", "6"]}
        ],
        [
          {"point12", ["1", "2", "7"]}
        ],
        [
          {"point13", ["1", "6", "7"]}
        ],
        [
          {"point14", ["2", "6", "7"]}
        ]
      ]

  """

  # Several `when` clauses on one function head are alternatives (OR): the head
  # matched as soon as any single check held, so nothing was really validated.
  # The checks are joined with `and`; `nil` stays allowed for
  # `similarity_function` because it is the declared default and is passed
  # through to Algorithm.clusterize/4 unchanged.
  def clusterize(points, number_of_clusters, theta \\ 0.5, similarity_function \\ nil)
      when is_list(points) and is_number(number_of_clusters) and is_number(theta) and
             (is_nil(similarity_function) or is_function(similarity_function)) do
    points
    |> Utils.internalize_points()
    |> Algorithm.clusterize(number_of_clusters, theta, similarity_function)
    |> Utils.externalize_clusters()
  end
end
-------------------------------------------------------------------------------- /test/rock/algorithm_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.AlgorithmTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Algorithm 5 | alias Rock.Utils 6 | 7 | @points [ 8 | {"point1", ["1", "2", "3"]}, 9 | {"point2", ["1", "2", "4"]}, 10 | {"point3", ["1", "2", "5"]}, 11 | {"point4", ["1", "3", "4"]}, 12 | {"point5", ["1", "3", "5"]}, 13 | {"point6", ["1", "4", "5"]}, 14 | {"point7", ["2", "3", "4"]}, 15 | {"point8", ["2", "3", "5"]}, 16 | {"point9", ["2", "4", "5"]}, 17 | {"point10", ["3", "4", "5"]}, 18 | {"point11", ["1", "2", "6"]}, 19 | {"point12", ["1", "2", "7"]}, 20 | {"point13", ["1", "6", "7"]}, 21 | {"point14", ["2", "6", "7"]} 22 | ] 23 | |> Utils.internalize_points() 24 | @number_of_clusters 5 25 | 26 | test "clusterizes points with theta = 0.1" do 27 | theta = 0.1 28 | 29 | result = 30 | @points 31 | |> Algorithm.clusterize(@number_of_clusters, theta) 32 | |> Utils.externalize_clusters() 33 | 34 | [ 35 | [ 36 | {"point5", ["1", "3", "5"]}, 37 | {"point6", ["1", "4", "5"]}, 38 | {"point10", ["3", "4", "5"]} 39 | ], 40 | [ 41 | {"point11", ["1", "2", "6"]}, 42 | {"point12", ["1", "2", "7"]}, 43 | {"point13", ["1", "6", "7"]} 44 | ], 45 | [ 46 | {"point3", ["1", "2", "5"]}, 47 | {"point4", ["1", "3", "4"]}, 48 |
{"point1", ["1", "2", "3"]}, 49 | {"point2", ["1", "2", "4"]} 50 | ], 51 | [ 52 | {"point7", ["2", "3", "4"]}, 53 | {"point8", ["2", "3", "5"]}, 54 | {"point9", ["2", "4", "5"]} 55 | ], 56 | [ 57 | {"point14", ["2", "6", "7"]} 58 | ] 59 | ] = result 60 | end 61 | 62 | test "clusterizes points with theta = 0.2" do 63 | theta = 0.2 64 | 65 | result = 66 | @points 67 | |> Algorithm.clusterize(@number_of_clusters, theta) 68 | |> Utils.externalize_clusters() 69 | 70 | [ 71 | [ 72 | {"point3", ["1", "2", "5"]}, 73 | {"point4", ["1", "3", "4"]}, 74 | {"point1", ["1", "2", "3"]}, 75 | {"point2", ["1", "2", "4"]}, 76 | {"point7", ["2", "3", "4"]}, 77 | {"point8", ["2", "3", "5"]}, 78 | {"point9", ["2", "4", "5"]} 79 | ], 80 | [ 81 | {"point11", ["1", "2", "6"]}, 82 | {"point12", ["1", "2", "7"]}, 83 | {"point5", ["1", "3", "5"]}, 84 | {"point6", ["1", "4", "5"]} 85 | ], 86 | [ 87 | {"point10", ["3", "4", "5"]} 88 | ], 89 | [ 90 | {"point13", ["1", "6", "7"]} 91 | ], 92 | [ 93 | {"point14", ["2", "6", "7"]} 94 | ] 95 | ] = result 96 | end 97 | 98 | test "clusterizes points with theta = 0.3" do 99 | theta = 0.3 100 | 101 | result = 102 | @points 103 | |> Algorithm.clusterize(@number_of_clusters, theta) 104 | |> Utils.externalize_clusters() 105 | 106 | [ 107 | [ 108 | {"point7", ["2", "3", "4"]}, 109 | {"point8", ["2", "3", "5"]}, 110 | {"point10", ["3", "4", "5"]}, 111 | {"point6", ["1", "4", "5"]}, 112 | {"point9", ["2", "4", "5"]} 113 | ], 114 | [ 115 | {"point1", ["1", "2", "3"]}, 116 | {"point2", ["1", "2", "4"]}, 117 | {"point3", ["1", "2", "5"]}, 118 | {"point11", ["1", "2", "6"]}, 119 | {"point12", ["1", "2", "7"]} 120 | ], 121 | [ 122 | {"point4", ["1", "3", "4"]}, 123 | {"point5", ["1", "3", "5"]} 124 | ], 125 | [ 126 | {"point13", ["1", "6", "7"]} 127 | ], 128 | [ 129 | {"point14", ["2", "6", "7"]} 130 | ] 131 | ] = result 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /lib/rock/struct/heap.ex: 
--------------------------------------------------------------------------------
defmodule Rock.Struct.Heap do
  defstruct cluster: nil, items: []

  alias Rock.Struct.Cluster
  alias Rock.Struct.Heap
  alias Rock.ClusterMergeCriterion
  @moduledoc false

  # Builds the heap owned by `cluster`. Every cluster of `other_clusters` that
  # has at least one cross link with it becomes an item
  # `{measure, cross_link_count, uuid}`; items are ordered by descending measure.
  # Raises ArgumentError when `cluster` itself is listed in `other_clusters`.
  def new(%Cluster{} = cluster, other_clusters, link_matrix, theta) do
    if Enum.member?(other_clusters, cluster) do
      raise ArgumentError, message: "cluster can not be member of heap items clusters"
    end

    entries =
      other_clusters
      |> Enum.map(&calculate_item(cluster, &1, link_matrix, theta))
      |> Enum.reject(fn {_, cross_link_count, _} -> cross_link_count == 0 end)
      |> sort()

    %Heap{cluster: cluster, items: entries}
  end

  # Drops the first item that refers to the cluster `uuid`; the heap is
  # returned with unchanged items when there is no such item.
  def remove_item(%Heap{items: items, cluster: cluster}, uuid) do
    refers_to_uuid? = fn
      {_, _, item_uuid} -> item_uuid == uuid
      _other -> false
    end

    remaining =
      case Enum.find_index(items, refers_to_uuid?) do
        nil -> items
        position -> List.delete_at(items, position)
      end

    %Heap{cluster: cluster, items: remaining}
  end

  # Inserts an item for `cluster` (measure computed from the given cross link
  # count) and keeps the items sorted.
  # Raises ArgumentError when the heap already has an item for that cluster.
  def add_item(
        %Heap{items: items, cluster: heap_cluster},
        %Cluster{uuid: uuid} = cluster,
        cross_link_count,
        theta
      ) do
    if exists_in_items?(uuid, items) do
      raise ArgumentError, message: "cluster is already member of the heap"
    end

    entry = calculate_item(heap_cluster, cluster, cross_link_count, theta)

    %Heap{cluster: heap_cluster, items: sort([entry | items])}
  end

  # A heap never holds an item for its own cluster.
  def find_item(%Heap{cluster: %Cluster{uuid: own_uuid}}, uuid) when uuid == own_uuid do
    nil
  end

  # Returns the item that refers to the cluster `uuid`, or nil.
  def find_item(%Heap{items: items}, uuid) do
    Enum.find(items, fn {_, _, item_uuid} -> item_uuid == uuid end)
  end

  # Re-orders the items by descending measure.
  def sort_items(%Heap{cluster: cluster, items: items}) do
    %Heap{cluster: cluster, items: sort(items)}
  end

  defp exists_in_items?(uuid, items) do
    Enum.any?(items, fn {_, _, item_uuid} -> item_uuid == uuid end)
  end

  # The third argument is either an already known cross link count (a number)
  # or the link matrix from which the count has to be computed.
  defp calculate_item(
         cluster,
         %Cluster{uuid: uuid} = other_cluster,
         cross_link_count,
         theta
       )
       when is_number(cross_link_count) do
    goodness = ClusterMergeCriterion.measure(cluster, other_cluster, theta, cross_link_count)

    {goodness, cross_link_count, uuid}
  end

  defp calculate_item(
         cluster,
         %Cluster{uuid: uuid} = other_cluster,
         link_matrix,
         theta
       ) do
    {goodness, cross_link_count} =
      ClusterMergeCriterion.measure(link_matrix, cluster, other_cluster, theta)

    {goodness, cross_link_count, uuid}
  end

  # Highest measure first.
  defp sort(items) do
    Enum.sort_by(items, fn {measure, _, _} -> -measure end)
  end
end
-------------------------------------------------------------------------------- /test/rock/cluster_merge_criterion_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.ClusterMergeCriterionTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Utils 5 | alias Rock.ClusterMergeCriterion 6 | alias Rock.Links 7 | alias Rock.Struct.Cluster 8 | alias Rock.NeighbourCriterion 9 | 10 |
test "counts number of cross links (example 1)" do 11 | points = 12 | [ 13 | {"point1", ["1", "2", "3", "4", "5"]}, 14 | {"point2", ["1", "2", "7", "9"]}, 15 | {"point3", ["1", "9"]} 16 | ] 17 | |> Utils.internalize_points() 18 | 19 | cluster1 = 20 | points 21 | |> Enum.at(0) 22 | |> List.wrap() 23 | |> Cluster.new() 24 | 25 | cluster2 = 26 | points 27 | |> List.delete_at(0) 28 | |> Cluster.new() 29 | 30 | neighbour_criterion = NeighbourCriterion.new(0.2) 31 | link_matrix = Links.matrix(points, neighbour_criterion) 32 | 33 | count = ClusterMergeCriterion.count_cross_links(link_matrix, cluster1, cluster2) 34 | 35 | ^count = Enum.at(link_matrix, 0) |> Enum.reduce(0, fn x, acc -> x + acc end) 36 | end 37 | 38 | test "counts number of cross links (example 2)" do 39 | points = 40 | [ 41 | {"point1", ["1", "2", "3"]}, 42 | {"point2", ["1", "2", "4"]}, 43 | {"point3", ["1", "2", "5"]}, 44 | {"point4", ["1", "3", "4"]}, 45 | {"point5", ["1", "3", "5"]}, 46 | {"point6", ["1", "4", "5"]}, 47 | {"point7", ["2", "3", "4"]}, 48 | {"point8", ["2", "3", "5"]}, 49 | {"point9", ["2", "4", "5"]}, 50 | {"point10", ["3", "4", "5"]}, 51 | {"point11", ["1", "2", "6"]}, 52 | {"point12", ["1", "2", "7"]}, 53 | {"point13", ["1", "6", "7"]}, 54 | {"point14", ["2", "6", "7"]} 55 | ] 56 | |> Utils.internalize_points() 57 | 58 | cluster1 = 59 | points 60 | |> Enum.at(0) 61 | |> List.wrap() 62 | |> Cluster.new() 63 | |> Cluster.add_point(points |> Enum.at(2)) 64 | 65 | cluster2 = 66 | points 67 | |> List.delete_at(0) 68 | |> List.delete_at(1) 69 | |> Cluster.new() 70 | 71 | neighbour_criterion = NeighbourCriterion.new(0.5) 72 | link_matrix = Links.matrix(points, neighbour_criterion) 73 | count = ClusterMergeCriterion.count_cross_links(link_matrix, cluster1, cluster2) 74 | 75 | first_row = 76 | link_matrix 77 | |> Enum.at(0) 78 | |> Enum.reduce(0, fn x, acc -> x + acc end) 79 | 80 | expected_count = 81 | link_matrix 82 | |> Enum.at(2) 83 | |> Enum.reduce(first_row, fn x, acc -> x + acc end) 84 | 
85 | ^count = expected_count 86 | end 87 | 88 | test "calculates cluster merge critertion" do 89 | points = 90 | [ 91 | {"point1", ["1", "2", "3"]}, 92 | {"point2", ["1", "2", "4"]}, 93 | {"point3", ["1", "2", "5"]}, 94 | {"point4", ["1", "3", "4"]}, 95 | {"point5", ["1", "3", "5"]}, 96 | {"point6", ["1", "4", "5"]}, 97 | {"point7", ["2", "3", "4"]}, 98 | {"point8", ["2", "3", "5"]}, 99 | {"point9", ["2", "4", "5"]}, 100 | {"point10", ["3", "4", "5"]}, 101 | {"point11", ["1", "2", "6"]}, 102 | {"point12", ["1", "2", "7"]}, 103 | {"point13", ["1", "6", "7"]}, 104 | {"point14", ["2", "6", "7"]} 105 | ] 106 | |> Utils.internalize_points() 107 | 108 | cluster1 = 109 | points 110 | |> Enum.at(0) 111 | |> List.wrap() 112 | |> Cluster.new() 113 | |> Cluster.add_point(points |> Enum.at(2)) 114 | 115 | cluster2 = 116 | points 117 | |> List.delete_at(0) 118 | |> List.delete_at(1) 119 | |> Cluster.new() 120 | 121 | neighbour_criterion = NeighbourCriterion.new(0.5) 122 | link_matrix = Links.matrix(points, neighbour_criterion) 123 | 124 | result = ClusterMergeCriterion.measure(link_matrix, cluster1, cluster2, 0.5) 125 | 126 | ^result = {6.9506352159723646, 106} 127 | end 128 | end 129 | -------------------------------------------------------------------------------- /test/rock/struct/heap_test.exs: -------------------------------------------------------------------------------- 1 | defmodule Rock.Struct.HeapTest do 2 | use ExUnit.Case 3 | 4 | alias Rock.Struct.Point 5 | alias Rock.Struct.Heap 6 | alias Rock.Struct.Cluster 7 | alias Rock.Test.TestFactory 8 | 9 | setup do 10 | points = [ 11 | Point.new("1", ["1", "2", "3"], 0), 12 | Point.new("2", ["1", "2", "4"], 1), 13 | Point.new("3", ["1", "2", "5"], 2), 14 | Point.new("4", ["1", "3", "4"], 3), 15 | Point.new("5", ["1", "3", "5"], 4), 16 | Point.new("6", ["1", "4", "5"], 5), 17 | Point.new("7", ["2", "3", "4"], 6), 18 | Point.new("8", ["2", "3", "5"], 7), 19 | Point.new("9", ["2", "4", "5"], 8), 20 | Point.new("10", ["3", 
"4", "5"], 9), 21 | Point.new("11", ["1", "2", "6"], 10), 22 | Point.new("12", ["1", "2", "7"], 11), 23 | Point.new("13", ["1", "6", "7"], 12), 24 | Point.new("14", ["2", "6", "7"], 13) 25 | ] 26 | 27 | link_matrix = [ 28 | [0, 7, 7, 5, 5, 4, 5, 5, 4, 4, 5, 5, 2, 2], 29 | [0, 0, 7, 5, 4, 5, 5, 4, 5, 4, 5, 5, 2, 2], 30 | [0, 0, 0, 4, 5, 5, 4, 5, 5, 4, 5, 5, 2, 2], 31 | [0, 0, 0, 0, 5, 5, 5, 4, 4, 5, 2, 2, 0, 0], 32 | [0, 0, 0, 0, 0, 5, 4, 5, 4, 5, 2, 2, 0, 0], 33 | [0, 0, 0, 0, 0, 0, 4, 4, 5, 5, 2, 2, 0, 0], 34 | [0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 2, 2, 0, 0], 35 | [0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 2, 2, 0, 0], 36 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 2, 2, 0, 0], 37 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 38 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 4], 39 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4], 40 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4], 41 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 42 | ] 43 | 44 | theta = 0.5 45 | 46 | point_clusters = 47 | points 48 | |> Enum.chunk_by(fn %Point{attributes: attrs} -> 49 | attrs |> Enum.at(0) == "1" 50 | end) 51 | 52 | clusters = 53 | point_clusters 54 | |> Enum.map(&Cluster.new(&1)) 55 | 56 | cluster = clusters |> Enum.at(0) 57 | clusters = clusters |> List.delete_at(0) 58 | 59 | { 60 | :ok, 61 | [ 62 | cluster: cluster, 63 | clusters: clusters, 64 | link_matrix: link_matrix, 65 | theta: theta 66 | ] 67 | } 68 | end 69 | 70 | test "initializes heap", 71 | %{ 72 | cluster: cluster = %Cluster{uuid: cluster_uuid}, 73 | clusters: clusters, 74 | link_matrix: link_matrix, 75 | theta: theta 76 | } do 77 | heap = cluster |> Heap.new(clusters, link_matrix, theta) 78 | 79 | %Heap{cluster: %Cluster{uuid: ^cluster_uuid}, items: items} = heap 80 | 81 | clusters 82 | |> Enum.each(fn %Cluster{uuid: uuid} -> 83 | assert items 84 | |> Enum.any?(fn {_, _, item_uuid} -> 85 | item_uuid == uuid 86 | end) 87 | end) 88 | end 89 | 90 | test "deletes item from heap" do 91 | items = [ 92 | {10, 15, UUID.uuid4()}, 93 | item = {9, 14, uuid = 
UUID.uuid4()}, 94 | {6, 12, UUID.uuid4()}, 95 | {5, 10, UUID.uuid4()} 96 | ] 97 | 98 | heap = TestFactory.create(:heap, items) 99 | 100 | %Heap{items: new_items} = heap |> Heap.remove_item(uuid) 101 | 102 | refute new_items |> Enum.member?(item) 103 | end 104 | 105 | test "adds new item to heap", 106 | %{cluster: cluster, clusters: clusters, link_matrix: link_matrix, theta: theta} do 107 | new_cluster = %Cluster{uuid: uuid} = clusters |> Enum.at(0) 108 | clusters = clusters |> List.delete_at(0) 109 | heap = cluster |> Heap.new(clusters, link_matrix, theta) 110 | cross_link_count = 10 111 | 112 | %Heap{items: items} = 113 | heap 114 | |> Heap.add_item(new_cluster, cross_link_count, theta) 115 | 116 | assert items 117 | |> Enum.any?(fn {_, _, cluster_uuid} -> 118 | cluster_uuid == uuid 119 | end) 120 | end 121 | 122 | test "finds item in heap", 123 | %{cluster: cluster, clusters: clusters, link_matrix: link_matrix, theta: theta} do 124 | %Cluster{uuid: uuid} = clusters |> Enum.at(0) 125 | heap = cluster |> Heap.new(clusters, link_matrix, theta) 126 | 127 | item = heap |> Heap.find_item(uuid) 128 | 129 | assert item 130 | end 131 | end 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ROCK 2 | 3 | ROCK: A Robust Clustering Algorithm for Categorical Attributes 4 | 5 | The algorithm's description http://theory.stanford.edu/~sudipto/mypapers/categorical.pdf 6 | 7 | ## Installation 8 | 9 | The easiest way to add Rock to your project is by [using Mix](http://elixir-lang.org/getting-started/mix-otp/introduction-to-mix.html). 
10 | 11 | Add `:rock` as a dependency to your project's `mix.exs`: 12 | 13 | ```elixir 14 | defp deps do 15 | [ 16 | {:rock, "~> 0.1.2"} 17 | ] 18 | end 19 | ``` 20 | 21 | And run: 22 | 23 | $ mix deps.get 24 | 25 | ## Basic Usage 26 | 27 | To clusterize points using the Rock algorithm you should use Rock.clusterize/4 with the arguments: 28 | 29 | * `points`, points that will be clusterized 30 | * `number_of_clusters`, the number of desired clusters. 31 | * `theta`, neighborhood parameter in the range [0,1). Default value is 0.5. 32 | * `similarity_function`, similarity function to use. Jaccard Coefficient is used by default. 33 | 34 | ## Examples 35 | 36 | ```elixir 37 | 38 | points = 39 | [ 40 | {"point1", ["1", "2", "3"]}, 41 | {"point2", ["1", "2", "4"]}, 42 | {"point3", ["1", "2", "5"]}, 43 | {"point4", ["1", "3", "4"]}, 44 | {"point5", ["1", "3", "5"]}, 45 | {"point6", ["1", "4", "5"]}, 46 | {"point7", ["2", "3", "4"]}, 47 | {"point8", ["2", "3", "5"]}, 48 | {"point9", ["2", "4", "5"]}, 49 | {"point10", ["3", "4", "5"]}, 50 | {"point11", ["1", "2", "6"]}, 51 | {"point12", ["1", "2", "7"]}, 52 | {"point13", ["1", "6", "7"]}, 53 | {"point14", ["2", "6", "7"]} 54 | ] 55 | 56 | # Example 1 57 | 58 | Rock.clusterize(points, 5, 0.4) 59 | [ 60 | [ 61 | {"point4", ["1", "3", "4"]}, 62 | {"point5", ["1", "3", "5"]}, 63 | {"point6", ["1", "4", "5"]}, 64 | {"point10", ["3", "4", "5"]}, 65 | {"point7", ["2", "3", "4"]}, 66 | {"point8", ["2", "3", "5"]} 67 | ], 68 | [ 69 | {"point11", ["1", "2", "6"]}, 70 | {"point12", ["1", "2", "7"]}, 71 | {"point1", ["1", "2", "3"]}, 72 | {"point2", ["1", "2", "4"]}, 73 | {"point3", ["1", "2", "5"]} 74 | ], 75 | [ 76 | {"point9", ["2", "4", "5"]} 77 | ], 78 | [ 79 | {"point13", ["1", "6", "7"]} 80 | ], 81 | [ 82 | {"point14", ["2", "6", "7"]} 83 | ] 84 | ] 85 | 86 | # Example 2 (with custom similarity function) 87 | 88 | similarity_function = fn( 89 | %Rock.Struct.Point{attributes: attributes1}, 90 | %Rock.Struct.Point{attributes:
attributes2}) -> 91 | 92 | count1 = Enum.count(attributes1) 93 | count2 = Enum.count(attributes2) 94 | 95 | if count1 >= count2, do: (count2 - 1) / count1, else: (count1 - 1) / count2 96 | end 97 | 98 | Rock.clusterize(points, 4, 0.5, similarity_function) 99 | [ 100 | [ 101 | {"point1", ["1", "2", "3"]}, 102 | {"point2", ["1", "2", "4"]}, 103 | {"point3", ["1", "2", "5"]}, 104 | {"point4", ["1", "3", "4"]}, 105 | {"point5", ["1", "3", "5"]}, 106 | {"point6", ["1", "4", "5"]}, 107 | {"point7", ["2", "3", "4"]}, 108 | {"point8", ["2", "3", "5"]}, 109 | {"point9", ["2", "4", "5"]}, 110 | {"point10", ["3", "4", "5"]}, 111 | {"point11", ["1", "2", "6"]} 112 | ], 113 | [ 114 | {"point12", ["1", "2", "7"]} 115 | ], 116 | [ 117 | {"point13", ["1", "6", "7"]} 118 | ], 119 | [ 120 | {"point14", ["2", "6", "7"]} 121 | ] 122 | ] 123 | ``` 124 | 125 | 126 | ## Contributing 127 | 128 | 1. [Fork it!](http://github.com/ayrat555/rock/fork) 129 | 2. Create your feature branch (`git checkout -b my-new-feature`) 130 | 3. Commit your changes (`git commit -am 'Add some feature'`) 131 | 4. Push to the branch (`git push origin my-new-feature`) 132 | 5. Create new Pull Request 133 | 134 | ## Author 135 | 136 | Ayrat Badykov (@ayrat555) 137 | 138 | ## License 139 | 140 | Rock is released under the MIT License. See the LICENSE file for further details. 141 | --------------------------------------------------------------------------------