SQL Grouper

Published by

on

This code uses a technique to map out related entities, placing in the same group. This is useful for applications such as identifying entities that share one or more relationships.

SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- =============================================
-- Author:		Jay Kidd
-- Create date: 2024-05-23
-- Description:	Rationalises Entity relationships into a common network
-- =============================================
CREATE OR ALTER PROCEDURE [utl].[Entity Grouping] (
	@Relations [utl].[Entity Relations In] READONLY,
	@MaxIterations TINYINT = 255 -- max number of UPDATE passes until failure
)
AS
BEGIN

	/* LOGIC:
		1. Make relationships symmetric, so {A->B} becomes {A-B} + {B->A} to simplify network exploration query to a single JOIN
		2. Pick an initial Group that is the lowest id of the entities. End-game is the Group id will be lowest Entity id within the network.
		3. Iterate over the relationships:
			Look for the lowest group a entity is associated with. If lower than its current group, set the lower value (rationalise the network).
			For example, relationships {A->B} and {B->C} would be processed as:
				Iteration 1:
					input  : {A-B}=>'A', {B-A}=>'A', {B-C}=>'B', {C-B}=>'B'
					process: {A}'s lowest group is currently 'A', so no change. {B}'s lowest group is 'A', but {B-C}=>'B' so needs to be updated. {C}'s lowest group is currently 'B', so no change.
					output : {A-B}=>'A', {B-A}=>'A', {B-C}=>*'A', {C-B}=>'B'
					changes: 1
				Iteration 2:
					input  : {A-B}=>'A', {B-A}=>'A', {B-C}=>'A', {C-B}=>'B'
					process: {A}'s lowest group is currently 'A', so no change. {B}'s lowest group is 'A', so no change. {C}'s lowest group is currently 'A', but {C-B}=>'B' so needs to be updated.
					output : {A-B}=>'A', {B-A}=>'A', {B-C}=>'A', {C-B}=>*'A'
					changes: 1
				Iteration 3:
					input  : {A-B}=>'A', {B-A}=>'A', {B-C}=>'A', {C-B}=>'A'
					process: {A}'s lowest group is currently 'A', so no change. {B}'s lowest group is 'A', so no change. {C}'s lowest group is currently 'B', so no change.
					output : {A-B}=>'A', {B-A}=>'A', {B-C}=>'A', {C-B}=>'A'
					changes: 0 = *STOP* Network is stable!
		4. Ensure the maximum number of iterations wasn't exceeded; if so error
		5. Return distinct entities and their lowest Entity group
	*/


	-- SET NOCOUNT ON added to prevent extra result sets from interfering with SELECT statements.
	SET NOCOUNT ON;

	-- setup variables
	DECLARE @msg NVARCHAR(max);

    -- create an analysis table; make it a temp table to support large datasets that don't fit in memory
	DROP TABLE IF EXISTS #entity_grouping_analysis_internal;
	CREATE TABLE #entity_grouping_internal (
		[Group] UNIQUEIDENTIFIER NOT NULL,
		[Entity A] UNIQUEIDENTIFIER NOT NULL,
		[Entity B] UNIQUEIDENTIFIER NOT NULL,
		PRIMARY KEY CLUSTERED ([Entity A], [Entity B])
	);

	-- populate it with initial values
	INSERT INTO
		#entity_grouping_internal
	SELECT DISTINCT
		CASE WHEN [Entity A] < [Entity B] THEN [Entity A] ELSE [Entity B] END AS [Group], -- choose the lowest id
		[Entity A],
		[Entity B]
	FROM
		@Relations AS r
	UNION -- needs to be symmetric
	SELECT DISTINCT
		CASE WHEN [Entity A] < [Entity B] THEN [Entity A] ELSE [Entity B] END AS [Group], -- choose the lowest id
		[Entity B] AS [Entity A],
		[Entity A] AS [Entity B]
	FROM
		@Relations AS r


	-- create a table to track changes
	DECLARE @chg TABLE ([chg] BIT);
	
	-- cycle until no updates
	DECLARE @chgs INT = 0, @iterations TINYINT = 0;
	WHILE @iterations < @MaxIterations
		BEGIN
			-- detect wider relationships
			;WITH relations AS (
				-- look for a lower GROUP within the network
				SELECT
					c.[Group],
					MIN(l.[Group]) AS [Lower]
				FROM
					#entity_grouping_internal AS c
						INNER JOIN
					#entity_grouping_internal AS l 
						ON c.[Entity A] = l.[Entity B]
				GROUP BY
					c.[Group]
				HAVING
					MIN(l.[Group]) < c.[Group]
			)
			-- update
			UPDATE
				c
			SET
				c.[Group] = r.[Lower] -- update the group to the lower group id
			OUTPUT 1 INTO @chg -- track how many record changes
			FROM
				#entity_grouping_internal AS c
					INNER JOIN
				relations AS r
					ON c.[Group] = r.[Group] -- group

			-- check if any changes
			SELECT @chgs = COUNT(*) FROM @chg
			SET @msg = ' Iteration ' + FORMAT(@iterations, '#,##0') + ' with ' + FORMAT(@chgs,'#,##0') + ' changes...'
			RAISERROR(@msg, 0, 0) WITH NOWAIT;
			IF @chgs = 0 BREAK; -- if no changes, break out of looping - the network is now rational and stable!
			
			-- prepare for another cycle
			DELETE FROM @chg;
			SET @iterations += 1;
		END

	-- check of max number of iterations failure
	IF @MaxIterations <= @iterations
		THROW 50000, 'The maximum number of grouping UPDATE iterations was hit, without the network stabilising!', 0
	
	-- show the result
	SELECT DISTINCT
		c.[Group],
		c.[Entity A] AS [Entity Member]
	FROM
		#entity_grouping_internal AS c

END
GO

Using this code:

/*** CREATE TYPES - This is a one off... ***/
-- create a table type for passing relationships  in
-- DROP TYPE [utl].[Entity Relations In]
IF TYPE_ID(N'utl.Entity Relations In') IS NULL
	CREATE TYPE [utl].[Entity Relations In] AS TABLE (
		[Entity A] UNIQUEIDENTIFIER NOT NULL,
		[Entity B] UNIQUEIDENTIFIER NOT NULL,
		PRIMARY KEY CLUSTERED ([Entity A], [Entity B]),
		CHECK ([Entity A] != [Entity B])
	);
-- create a table type for receiving relationships back
-- DROP TYPE [utl].[Entity Groups Out]
IF TYPE_ID(N'utl.Entity Groups Out') IS NULL
	CREATE TYPE [utl].[Entity Groups Out] AS TABLE (
		[Group] UNIQUEIDENTIFIER NOT NULL,
		[Entity Member] UNIQUEIDENTIFIER NOT NULL PRIMARY KEY CLUSTERED
	);


/*** CREATE TEST DATA ***/
SET NOCOUNT ON;
DECLARE @TestData AS [utl].[Entity Relations In];
INSERT INTO @TestData VALUES
	-- forward relations A-B-C and one back D-C
	('AAAAAAAA-0000-0000-0000-000000000000', 'BBBBBBBB-0000-0000-0000-000000000000'),
	('BBBBBBBB-0000-0000-0000-000000000000', 'CCCCCCCC-0000-0000-0000-000000000000'),
	('DDDDDDDD-0000-0000-0000-000000000000', 'CCCCCCCC-0000-0000-0000-000000000000'),
	-- back relations F-E
	('FFFFFFFF-0000-0000-0000-000000000000', 'EEEEEEEE-0000-0000-0000-000000000000'),
	-- groups of relations 1-3, 2-4, 5-6, 5-2, 1-6 -- checks to ensure groups don't split / normalise together
	('11111111-0000-0000-0000-000000000000', '33333333-0000-0000-0000-000000000000'),
	('22222222-0000-0000-0000-000000000000', '44444444-0000-0000-0000-000000000000'),
	('55555555-0000-0000-0000-000000000000', '66666666-0000-0000-0000-000000000000'),
	('55555555-0000-0000-0000-000000000000', '22222222-0000-0000-0000-000000000000'),
	('11111111-0000-0000-0000-000000000000', '66666666-0000-0000-0000-000000000000');


-- perform the grouping
DECLARE @Grouped AS [utl].[Entity Groups Out];
INSERT INTO
	@Grouped
EXEC [utl].[Entity Grouping] @Relations=@TestData


-- display: In data
SELECT * FROM @TestData
-- display: Grouped data
SELECT * FROM @Grouped

This would create the output:

Leave a comment

Previous Post