[MSSQL] Northwind

10 Feb 2021

Reading time ~9 minutes

Reference by [C#과 유니티로 만드는 MMORPG 게임 개발 시리즈]Part5: 데이터베이스

Data Analysis

Download Northwind Database

GITHUB
- DOWNLOAD Northwind

MSSQL Northwind

EXEC

EXEC sp_helpdb
- show information of databse

EXEC sp_helpdb 'Northwind';

MSSQL Northwind

INDEX

FILLFACTOR
- use only 1% of leaf page space
PAD_INDEX
- apply to FILLFACTOR middle page

CREATE INDEX Test_Inde ON Test(LastName)
WITH (FILLFACTOR = 1, PAD_INDEX = ON)

MSSQL Northwind

sys.indexes
- help to search index number

SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('Test');

MSSQL Northwind

DBCC

DBCC
- show database data that match with the condition
IND
- show index data

DBCC IND('Northwind', 'Test', 2);

MSSQL Northwind

PAGE
- ref: [Database], [File Number], [Page Number], [Print Option]
HEAD_RID
- ref: Page Address, File ID, Slot Number

DBCC PAGE('Northwind', 1, 840, 3);
DBCC PAGE('Northwind', 1, 848, 3);
DBCC PAGE('Northwind', 1, 849, 3);

MSSQL Northwind

DBCC PAGE('Northwind', 1, 872, 3);
DBCC PAGE('Northwind', 1, 848, 3);
DBCC PAGE('Northwind', 1, 849, 3);

MSSQL Northwind

Random Access
- Access one page by one page to read
Bookmark Lookup
- search row by RID

Multiple Index

Copy table to new table

SELECT INTO
- copy the table and paste in a new table

SELECT *
INTO TestOrderDetails
FROM [Order Details];

MSSQL Northwind

EXEC

EXEC sp_helpindex
- show information of index

CREATE INDEX Index_TestOrderDetails
ON TestOrderDetails(OrderID, ProductID);

EXEC sp_helpindex 'TestOrderDetails';

MSSQL Northwind

Index performance

Multiple Index
- when you use multiple index(A, B), the index order is A and B is next level of A
- So, if you want to use index, you have to start A
- if you start B, then the process cannot use index
SEEK
- process speed is good
- use index when searching

SELECT *
FROM TestOrderDetails
WHERE OrderID = 10248 AND ProductID = 11;

SELECT *
FROM TestOrderDetails
WHERE ProductID = 11 AND OrderID = 10248;

SELECT *
FROM TestOrderDetails
WHERE OrderID = 10248;

MSSQL Northwind

SELECT LastName
INTO TestEmployees
FROM Employees;

CREATE INDEX Index_TestEmployees
ON TestEmployees(LastName);

SELECT *
FROM TestEmployees
WHERE LastName LIKE 'Bu%';

MSSQL Northwind

SCAN
- process speed is bad
- do not use index when searching

SELECT *
FROM TestOrderDetails
WHERE ProductID = 11;

MSSQL Northwind

when you use data processing, all of data will be scanned even it is index

SELECT *
FROM TestEmployees
WHERE SUBSTRING(LastName, 1, 2) = 'Bu';

MSSQL Northwind

Page Split

Page Split
- if page is full, make a new page and insert in specific position

DECLARE @i INT = 0;
WHILE @i < 50
BEGIN
	INSERT INTO TestOrderDetails
	VALUES(10248, 100 + @i, 10, 1, 0);
	SET @i = @i +1;
END

MSSQL Northwind

Clustered vs Non-Clustered

Clustered

Clustered
- Leaf Page = Data Page
- data is ordered by Clustered index

Non-Clustered

not exist Clustered Index
- data is saved in Heal Table
- Heap RIP → Heap Table To exract data

SELECT *
INTO TestOrderDetails2 
FROM [Order Details];

SELECT *
FROM TestOrderDetails2;

CREATE INDEX Index_OrderDetails
ON TestOrderDetails2(OrderID, productID);

EXEC sp_helpindex 'TestOrderDetails2';

SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrderDetails2');

DBCC IND('Northwind', 'TestOrderDetails2', 2);

MSSQL Northwind

HEAP RID
- Heap RID([Page Address(4)] [File ID(2)][Slot(2)] ROW)
- Heap Table[{Page} {Page} {Page} {Page}]

DBCC PAGE('Northwind', 1, 928, 3);

MSSQL Northwind

exist Clustered Index
- no Heap Table. Real data is in Leaf Table
  - has real Key value of Clustered Index
- Previous index can be changed

CREATE CLUSTERED INDEX Index_OrderDetails_Clustered
ON TestOrderDetails2(OrderID);

SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrderDetails2');

DBCC IND('Northwind', 'TestOrderDetails2', 1);
DBCC IND('Northwind', 'TestOrderDetails2', 2);

MSSQL Northwind

DBCC PAGE('Northwind', 1, 9120, 3);

MSSQL Northwind

Index Scan VS Index Seek

Index Scan vs Index Seek

Index Access Process
- Index Scan: Search LEAF PAGE sequentially
- Index Seek: Search with index

CONVERT

CONVERT
- change type of variable

CREATE TABLE TestAccess
(
	id INT NOT NULL,
	name NCHAR(50) NOT NULL,
	dummy NCHAR(1000) NULL
);

CREATE CLUSTERED INDEX TestAccess_CI
ON TestAccess(id);

CREATE NONCLUSTERED INDEX TestAccess_NCI
ON TestAccess(name);

DECLARE @i INT;
SET @i = 1;

DECLARE @i INT;
SET @i = 1;

WHILE(@i <= 500)
BEGIN
	INSERT INTO TestAccess
	VALUES(@i, 'Name' + CONVERT(VARCHAR, @i), 'Hello World' + CONVERT(VARCHAR, @i));
	SET @i = @i + 1;
END

EXEC sp_helpindex 'TestAccess';

SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestAccess');

MSSQL Northwind

Clustered

DBCC IND('Northwind', 'TestAccess', 1);

MSSQL Northwind

Non-Clustered

DBCC IND('Northwind', 'TestAccess', 2);

MSSQL Northwind

SET STATISTICS

TIME
- Time for Logical read
IO
- Number of pages read to find actual data

SET STATISTICS TIME ON;
SET STATISTICS IO ON;

Scan Process

All table
- INDEX SCAN

SELECT *
FROM TestAccess;

MSSQL Northwind

Non-Clustered
INDEX SCAN + KEY LOOKUP

\[\begin{align*} Step = N * 2 + a \end{align*}\]

N is number of rows which you want to search
2 is for root → page stesp in Non-Clustered

SELECT TOP 5 *
FROM TestAccess
ORDER BY name;

MSSQL Northwind

Seek Process

Clustered
- INDEX SEEK
- root → page
- 2 steps

SELECT *
FROM TestAccess
WHERE id = 104;

MSSQL Northwind

Non-Clustered
- INDEX SEEK + KEY LOOKUP
- root → page → clustered root → page
- page has Key of clustered index
- 4 steps

SELECT *
FROM TestAccess
WHERE name = 'name5';

MSSQL Northwind

Bookmark Lookup

SET STATISTICS

PROFILE
- show the actual order in which it was executed

SELECT *
INTO TestOrders
FROM Orders;

SELECT *
FROM TestOrders;

CREATE NONCLUSTERED INDEX Orders_Index01
ON TestOrders(CustomerID);

SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrders');

DBCC IND('Northwind', 'TestOrders', 2);

SET STATISTICS TIME ON;
SET STATISTICS IO ON;
SET STATISTICS PROFILE ON;

WITH INDEX

WITH INDEX
- force index usage
- Database choose whether index usage or not
- if it is more better index not usage, database can choose SCAN

SELECT *
FROM TestOrders
WHERE CustomerID = 'QUICK';

MSSQL Northwind

SELECT *
FROM TestOrders WITH(INDEX(Orders_Index01))
WHERE CustomerID = 'QUICK';

MSSQL Northwind

Decrease Lookup

Covered Index
- include all column to search in index
- but it can putsa load on the DML(CREATE, DELETE, UPDATE)

CREATE NONCLUSTERED INDEX Orders_Index02
ON TestOrders(CustomerID, ShipVia);

SELECT *
FROM TestOrders WITH(INDEX(Orders_Index02))
WHERE CustomerID = 'QUICK' AND ShipVia = 3;

MSSQL Northwind

INCLUDE
- give a hint to index by INCLUDE

CREATE NONCLUSTERED INDEX Orders_Index03
ON TestOrders(CustomerID) INCLUDE (ShipVia);

MSSQL Northwind

Clustered
- Only one column
- it can put a load if there is Non-Clustred

Index Column Order

Create dummy data

USE Northwind;

SELECT *
INTO TestOrders
FROM Orders;

DECLARE @i INT = 1;
DECLARE @emp INT;

SELECT @emp = MAX(EmployeeID) 
FROM Orders;

SELECT *
FROM TestOrders;

WHILE(@i < 1000)
BEGIN
	INSERT INTO TestOrders(CustomerID, EmployeeID, OrderDate)
	SELECT CustomerID, @emp + @i, OrderDate
	FROM Orders;
	SET @i = @i + 1;
END

SELECT COUNT(*)
FROM TestOrders;

MSSQL Northwind

Create Index

idx_emp_ord creates index by EmployeeID at first, and then by OrderDate
idx_ord_emp creates index by OrderDate at first, and then by EmployeeID

CREATE NONCLUSTERED INDEX idx_emp_ord
ON TestOrders(EmployeeID, OrderDate);

CREATE NONCLUSTERED INDEX idx_ord_emp
ON TestOrders(OrderDate, EmployeeID);

SET STATISTICS TIME ON;
SET STATISTICS IO ON;

Compare

Case 1: no range in search
- there is same logiacal read time
- and both use index to search

SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate = CONVERT(DATETIME, '19970101');

MSSQL Northwind

SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate = CONVERT(DATETIME, '19970101');

MSSQL Northwind

first, database searches the page which have a real data matching with condition
and check the next row
if the next row is not matched with condition, then return
So, idx_emp_ord and idx_emp_ord searched same count of row
this is because their search order did not affect to search

SELECT *
FROM TestOrders
ORDER BY EmployeeID, OrderDate;

MSSQL Northwind

SELECT *
FROM TestOrders
ORDER BY OrderDate, EmployeeID;

MSSQL Northwind

Case 2: range in search
- both use index also
- but both of logical read time are different

-- Same with next SELECT statement ---------------
SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate >= '19970101' AND OrderDate <= '19970103';
--------------------------------------------------

SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate BETWEEN '19960701' AND '19970103';

MSSQL Northwind

SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate BETWEEN '19960701' AND '19970103';

MSSQL Northwind

in idx_emp_ord, database searches the page which match with 1 in EmployeeID, and then match with between 19970101 and 19970103 in OrderDate
and check next row
it is ordered by EmployeeID first, so next row must have same or bigger EmployeeID
So, idx_emp_ord searched only 3 rows

SELECT *
FROM TestOrders
ORDER BY EmployeeID, OrderDate;

MSSQL Northwind

in idx_ord_emp, database searches the page wich match with between 19970101 and 19970103 in OrderDate, and then match with 1 in EmployeeID
and check next row
it is ordered by OrderDate first, So if the OrderDate is in between 19970101 and 19970103, then database have to check EmployeeID in the rows
So idx_ord_emp searched more then 3 rows

SELECT *
FROM TestOrders
ORDER BY OrderDate, EmployeeID;

MSSQL Northwind

IN-LIST

IN()
- if between range is small, then change the range to IN()

SET STATISTICS PROFILE ON;

SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate IN('19960101','19970102', '19970103');

MSSQL Northwind

SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate IN('19960101','19970102', '19970103');

MSSQL Northwind

Join

Hash Join

Hash Join does not need to sort → The more data to Merge, The better to Hash
not be infected by Index
- Hash is better than NL/Merge(depends on situation)
- But you have to concider the cost of hash table(process time become bigger → Index)
Random Access X
smaller data is good to make Hash Table

SELECT *
FROM TestOrders AS o
	INNER JOIN TestCustomers AS c
	ON o.CustomerID = c.CustomerID;

MSSQL Northwind

Hash Join in C#

  using System;
  using System.Collections.Generic;

  namespace HashJoin
  {
    class Player
    {
      public int playerId;
    }

    class Salary
    {
      public int playerId;
    }

    class HashTable
    {
      int _bucketCount;
      List<int>[] _buckets;

      public HashTable(int bucketCount = 100)
      {
        _bucketCount = bucketCount;
        _buckets = new List<int>[bucketCount];

        for (int i = 0; i < bucketCount; i++)
          _buckets[i] = new List<int>();
      }

      public void Add(int value)
      {
        int key = value % _bucketCount;
        _buckets[key].Add(value);
      }

      public bool Find(int value)
      {
        int key = value % _bucketCount;

        // _buckets[key].Contains(value);
        foreach(int v in _buckets[key])
        {
          if (v == value)
            return true;
        }
        return false;
      }
    }

    class Program
    {
      static void Main(string[] args)
      {
        Random rand = new Random();

        List<Player> players = new List<Player>();
        for (int i = 0; i < 1000; i++)
        {
          if (rand.Next(0, 2) == 0)
            continue;

          players.Add(new Player() { playerId = i });
        }

        List<Salary> salaries = new List<Salary>();
        for (int i = 0; i < 1000; i++)
        {
          if (rand.Next(0, 2) == 0)
            continue;

          salaries.Add(new Salary() { playerId = i });
        }

        // TEMP HashTable
        /*Dictionary<int, Salary> hash = new Dictionary<int, Salary>();       
        foreach (Salary s in salaries)
          hash.Add(s.playerId, s);

        List<int> result = new List<int>();
        foreach(Player p in players)
        {
          if (hash.ContainsKey(p.playerId))
            result.Add(p.playerId);
        }*/

        HashTable hash = new HashTable();
        foreach (Salary s in salaries)
          hash.Add(s.playerId);

        List<int> result = new List<int>();
        foreach(Player p in players)
        {
          if (hash.Find(p.playerId))
            result.Add(p.playerId);
        }
      }
    }
  }

Download