Data Analysis
Download Northwind Database
- GITHUB
- DOWNLOAD Northwind
data:image/s3,"s3://crabby-images/50da6/50da6f200ce004c094291f5c7e4799c3e1d80f64" alt=""
EXEC
- EXEC sp_helpdb
- show information of databse
EXEC sp_helpdb 'Northwind';
data:image/s3,"s3://crabby-images/370d9/370d913d2f1e2a42f1aa09d21bf5aff77256eb29" alt=""
INDEX
- FILLFACTOR
- use only 1% of leaf page space
- PAD_INDEX
- apply to FILLFACTOR middle page
CREATE INDEX Test_Inde ON Test(LastName)
WITH (FILLFACTOR = 1, PAD_INDEX = ON)
data:image/s3,"s3://crabby-images/b5aaa/b5aaa966db0a7fac7945f43d3ae9efc6600bdcc2" alt=""
- sys.indexes
- help to search index number
SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('Test');
data:image/s3,"s3://crabby-images/8aeb0/8aeb0ff2ff2b5ae44adc25c3e761d55e6d5c2c4a" alt=""
DBCC
- DBCC
- show database data that match with the condition
- IND
- show index data
DBCC IND('Northwind', 'Test', 2);
data:image/s3,"s3://crabby-images/412f6/412f61edd77e6f7511d2a9b768cd38ca783f95f3" alt=""
data:image/s3,"s3://crabby-images/c26da/c26da954161687ac748c2488e8f52d7e0d87aea1" alt=""
- PAGE
- ref: [Database], [File Number], [Page Number], [Print Option]
- HEAD_RID
- ref: Page Address, File ID, Slot Number
DBCC PAGE('Northwind', 1, 840, 3);
DBCC PAGE('Northwind', 1, 848, 3);
DBCC PAGE('Northwind', 1, 849, 3);
data:image/s3,"s3://crabby-images/c1273/c1273d0c51f3078240bc306e1f8add192bcdad05" alt=""
DBCC PAGE('Northwind', 1, 872, 3);
DBCC PAGE('Northwind', 1, 848, 3);
DBCC PAGE('Northwind', 1, 849, 3);
data:image/s3,"s3://crabby-images/ab959/ab959a91158c5229b0dba0da3e8668464b70e952" alt=""
- Random Access
- Access one page by one page to read
- Bookmark Lookup
- search row by RID
Multiple Index
Copy table to new table
- SELECT INTO
- copy the table and paste in a new table
SELECT *
INTO TestOrderDetails
FROM [Order Details];
data:image/s3,"s3://crabby-images/f37d0/f37d0687214303c5e4d48e17739e16abdb1f78c3" alt=""
EXEC
- EXEC sp_helpindex
- show information of index
CREATE INDEX Index_TestOrderDetails
ON TestOrderDetails(OrderID, ProductID);
EXEC sp_helpindex 'TestOrderDetails';
data:image/s3,"s3://crabby-images/fd08e/fd08e2565dfef093d7c3145e599bba1e18239151" alt=""
Index performance
- Multiple Index
- when you use multiple index(A, B), the index order is A and B is next level of A
- So, if you want to use index, you have to start A
- if you start B, then the process cannot use index
- SEEK
- process speed is good
- use index when searching
SELECT *
FROM TestOrderDetails
WHERE OrderID = 10248 AND ProductID = 11;
SELECT *
FROM TestOrderDetails
WHERE ProductID = 11 AND OrderID = 10248;
SELECT *
FROM TestOrderDetails
WHERE OrderID = 10248;
data:image/s3,"s3://crabby-images/7259a/7259a3f3cd6e9bd0ebf7f6f261b3b4869272d7d1" alt=""
data:image/s3,"s3://crabby-images/08082/08082d447a987bfd6750839f3158c7f352ebbda4" alt=""
data:image/s3,"s3://crabby-images/32047/32047ce9b96901ffa5ae7294aaa7236ddc35ef57" alt=""
SELECT LastName
INTO TestEmployees
FROM Employees;
CREATE INDEX Index_TestEmployees
ON TestEmployees(LastName);
SELECT *
FROM TestEmployees
WHERE LastName LIKE 'Bu%';
data:image/s3,"s3://crabby-images/2ba1a/2ba1a652e75817ee7f435762062ee406ac389030" alt=""
- SCAN
- process speed is bad
- do not use index when searching
SELECT *
FROM TestOrderDetails
WHERE ProductID = 11;
data:image/s3,"s3://crabby-images/dd146/dd146f55b47b83821b25d13eae6cec8f3cd38b3c" alt=""
- when you use data processing, all of data will be scanned even it is index
SELECT *
FROM TestEmployees
WHERE SUBSTRING(LastName, 1, 2) = 'Bu';
data:image/s3,"s3://crabby-images/44001/440016b3e0b3fc4e4545a08b7f8484ffe5e01e98" alt=""
Page Split
- Page Split
- if page is full, make a new page and insert in specific position
DECLARE @i INT = 0;
WHILE @i < 50
BEGIN
INSERT INTO TestOrderDetails
VALUES(10248, 100 + @i, 10, 1, 0);
SET @i = @i +1;
END
data:image/s3,"s3://crabby-images/b02fe/b02fead8521224a1cdcd5206f08a46411684f49d" alt=""
data:image/s3,"s3://crabby-images/29353/293535bb977aac3087dfd00c8fdad63aedb9d313" alt=""
data:image/s3,"s3://crabby-images/aa972/aa97229037452d2f852eeec691a97170bd8a4f35" alt=""
Clustered vs Non-Clustered
Clustered
- Clustered
- Leaf Page = Data Page
- data is ordered by Clustered index
Non-Clustered
- not exist Clustered Index
- data is saved in Heal Table
- Heap RIP → Heap Table To exract data
SELECT *
INTO TestOrderDetails2
FROM [Order Details];
SELECT *
FROM TestOrderDetails2;
CREATE INDEX Index_OrderDetails
ON TestOrderDetails2(OrderID, productID);
EXEC sp_helpindex 'TestOrderDetails2';
SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrderDetails2');
DBCC IND('Northwind', 'TestOrderDetails2', 2);
data:image/s3,"s3://crabby-images/9c49c/9c49cd0483d8bfadbe21092024f0711a250c0d12" alt=""
data:image/s3,"s3://crabby-images/174df/174df89827276a5302b19bf6a0e033cc6a75b202" alt=""
- HEAP RID
- Heap RID([Page Address(4)] [File ID(2)][Slot(2)] ROW)
- Heap Table[{Page} {Page} {Page} {Page}]
DBCC PAGE('Northwind', 1, 928, 3);
data:image/s3,"s3://crabby-images/64480/64480ffdb67227387fd3790afe1acea2a78262c7" alt=""
- exist Clustered Index
- no Heap Table. Real data is in Leaf Table
- has real Key value of Clustered Index
- Previous index can be changed
- no Heap Table. Real data is in Leaf Table
CREATE CLUSTERED INDEX Index_OrderDetails_Clustered
ON TestOrderDetails2(OrderID);
SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrderDetails2');
DBCC IND('Northwind', 'TestOrderDetails2', 1);
DBCC IND('Northwind', 'TestOrderDetails2', 2);
data:image/s3,"s3://crabby-images/7199b/7199b365acc113381253ab0540c2b7a68334d980" alt=""
data:image/s3,"s3://crabby-images/6c90a/6c90aeca7aa731cbb019a9b4acbe7fd994ca24a8" alt=""
data:image/s3,"s3://crabby-images/bcf6e/bcf6ebae42a842b9efb5b29e6e9d63e3c0cfa97d" alt=""
DBCC PAGE('Northwind', 1, 9120, 3);
data:image/s3,"s3://crabby-images/097f1/097f19579be648bc0121e01f47b0be818e75440e" alt=""
data:image/s3,"s3://crabby-images/ee328/ee328406639414ddb541baac9dd4b78c1b263368" alt=""
Index Scan VS Index Seek
Index Scan vs Index Seek
- Index Access Process
- Index Scan: Search LEAF PAGE sequentially
- Index Seek: Search with index
CONVERT
- CONVERT
- change type of variable
CREATE TABLE TestAccess
(
id INT NOT NULL,
name NCHAR(50) NOT NULL,
dummy NCHAR(1000) NULL
);
CREATE CLUSTERED INDEX TestAccess_CI
ON TestAccess(id);
CREATE NONCLUSTERED INDEX TestAccess_NCI
ON TestAccess(name);
DECLARE @i INT;
SET @i = 1;
DECLARE @i INT;
SET @i = 1;
WHILE(@i <= 500)
BEGIN
INSERT INTO TestAccess
VALUES(@i, 'Name' + CONVERT(VARCHAR, @i), 'Hello World' + CONVERT(VARCHAR, @i));
SET @i = @i + 1;
END
EXEC sp_helpindex 'TestAccess';
SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestAccess');
data:image/s3,"s3://crabby-images/15b92/15b92bedc3a8c404222b87d4d40cbed7b6adc3a3" alt=""
Clustered
DBCC IND('Northwind', 'TestAccess', 1);
data:image/s3,"s3://crabby-images/2b0e8/2b0e8e7ca461337266f180518e551d6d84ec6328" alt=""
data:image/s3,"s3://crabby-images/ddea4/ddea4d626a7f49202b52392a66e680df4866d6d4" alt=""
Non-Clustered
DBCC IND('Northwind', 'TestAccess', 2);
data:image/s3,"s3://crabby-images/6a4cd/6a4cdc7e0ceb822ce0fdc30677179609c3c7b7c4" alt=""
data:image/s3,"s3://crabby-images/cb306/cb3062a7b75cab47dce33e2fc68590b1534ac87b" alt=""
SET STATISTICS
- TIME
- Time for Logical read
- IO
- Number of pages read to find actual data
SET STATISTICS TIME ON;
SET STATISTICS IO ON;
Scan Process
- All table
- INDEX SCAN
SELECT *
FROM TestAccess;
data:image/s3,"s3://crabby-images/ff11c/ff11cbe1b5f53a552f62bf5a45bda7f1201fa0aa" alt=""
data:image/s3,"s3://crabby-images/0e36a/0e36a244568b03ae3065ea9c6bd898e69660ed12" alt=""
- Non-Clustered
- INDEX SCAN + KEY LOOKUP
- N is number of rows which you want to search
- 2 is for root → page stesp in Non-Clustered
SELECT TOP 5 *
FROM TestAccess
ORDER BY name;
data:image/s3,"s3://crabby-images/162e8/162e8847b7f30780bb933a0b140b873c2642651d" alt=""
data:image/s3,"s3://crabby-images/edee8/edee8142cd8b7c27de747ec1bf088999e18358ae" alt=""
Seek Process
- Clustered
- INDEX SEEK
- root → page
- 2 steps
SELECT *
FROM TestAccess
WHERE id = 104;
data:image/s3,"s3://crabby-images/0afb3/0afb30fcff054567208bde08e977fa01898c2a4b" alt=""
data:image/s3,"s3://crabby-images/a4b9d/a4b9d24fca15eb1ec2180258e2136464fbbf92b6" alt=""
- Non-Clustered
- INDEX SEEK + KEY LOOKUP
- root → page → clustered root → page
- page has Key of clustered index
- 4 steps
SELECT *
FROM TestAccess
WHERE name = 'name5';
data:image/s3,"s3://crabby-images/c5d9f/c5d9fd7e1b468f8f99e12a8e9bffc96141a79aaf" alt=""
data:image/s3,"s3://crabby-images/b2a4b/b2a4b0770b74f34a5329f690cfb9e0d61ad5681d" alt=""
Bookmark Lookup
SET STATISTICS
- PROFILE
- show the actual order in which it was executed
SELECT *
INTO TestOrders
FROM Orders;
SELECT *
FROM TestOrders;
CREATE NONCLUSTERED INDEX Orders_Index01
ON TestOrders(CustomerID);
SELECT index_id, name
FROM sys.indexes
WHERE object_id = object_id('TestOrders');
DBCC IND('Northwind', 'TestOrders', 2);
SET STATISTICS TIME ON;
SET STATISTICS IO ON;
SET STATISTICS PROFILE ON;
WITH INDEX
- WITH INDEX
- force index usage
- Database choose whether index usage or not
- if it is more better index not usage, database can choose
SCAN
SELECT *
FROM TestOrders
WHERE CustomerID = 'QUICK';
data:image/s3,"s3://crabby-images/270bc/270bced4c71d30557bf8bf1bf62e772886b5b1e1" alt=""
data:image/s3,"s3://crabby-images/32968/32968a841623ac63b4bf327a974b2d00cd31c1bd" alt=""
data:image/s3,"s3://crabby-images/a6413/a641323badc3ff10c5502bd2aa7094c6d11d9768" alt=""
SELECT *
FROM TestOrders WITH(INDEX(Orders_Index01))
WHERE CustomerID = 'QUICK';
data:image/s3,"s3://crabby-images/bbfb4/bbfb45e5226d9e4ed3bb71a90fa94e394096131f" alt=""
data:image/s3,"s3://crabby-images/88f1a/88f1ac0b0eac494e096cd6d96d45e1ab1b01fccd" alt=""
data:image/s3,"s3://crabby-images/74469/74469203e58dc417076fe9d34b5c8411306a2df5" alt=""
Decrease Lookup
- Covered Index
- include all column to search in index
- but it can putsa load on the DML(CREATE, DELETE, UPDATE)
CREATE NONCLUSTERED INDEX Orders_Index02
ON TestOrders(CustomerID, ShipVia);
SELECT *
FROM TestOrders WITH(INDEX(Orders_Index02))
WHERE CustomerID = 'QUICK' AND ShipVia = 3;
data:image/s3,"s3://crabby-images/d2221/d2221fb1dcecccfef6e594fae161094901cf456d" alt=""
data:image/s3,"s3://crabby-images/ac440/ac440b5a8256d556895b3fd2db8eae40d3df1d5a" alt=""
data:image/s3,"s3://crabby-images/d6e29/d6e298a9c233cd266414cf816db77a5fecf9176a" alt=""
- INCLUDE
- give a hint to index by
INCLUDE
- give a hint to index by
CREATE NONCLUSTERED INDEX Orders_Index03
ON TestOrders(CustomerID) INCLUDE (ShipVia);
data:image/s3,"s3://crabby-images/93007/930072460697d7109ff35392f50bdfd7470dd350" alt=""
data:image/s3,"s3://crabby-images/3be78/3be787f9f9976f69da781093c8e6f9424e95b90a" alt=""
data:image/s3,"s3://crabby-images/e3af8/e3af8a161f3c19a9d7dae4c05c25e913c1e3dd7f" alt=""
- Clustered
- Only one column
- it can put a load if there is Non-Clustred
Index Column Order
Create dummy data
USE Northwind;
SELECT *
INTO TestOrders
FROM Orders;
DECLARE @i INT = 1;
DECLARE @emp INT;
SELECT @emp = MAX(EmployeeID)
FROM Orders;
SELECT *
FROM TestOrders;
WHILE(@i < 1000)
BEGIN
INSERT INTO TestOrders(CustomerID, EmployeeID, OrderDate)
SELECT CustomerID, @emp + @i, OrderDate
FROM Orders;
SET @i = @i + 1;
END
SELECT COUNT(*)
FROM TestOrders;
data:image/s3,"s3://crabby-images/b7cce/b7cce2fa124441f68cd8cba39f0f22091b1869f8" alt=""
Create Index
idx_emp_ord
creates index byEmployeeID
at first, and then byOrderDate
idx_ord_emp
creates index byOrderDate
at first, and then byEmployeeID
CREATE NONCLUSTERED INDEX idx_emp_ord
ON TestOrders(EmployeeID, OrderDate);
CREATE NONCLUSTERED INDEX idx_ord_emp
ON TestOrders(OrderDate, EmployeeID);
SET STATISTICS TIME ON;
SET STATISTICS IO ON;
Compare
- Case 1: no range in search
- there is same logiacal read time
- and both use index to search
SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate = CONVERT(DATETIME, '19970101');
data:image/s3,"s3://crabby-images/001d3/001d37949ac7a63adfdc095f12ff396428b8143d" alt=""
data:image/s3,"s3://crabby-images/7f2da/7f2da93ab563cf464ceb7c50841fcc6e6ac1c7f6" alt=""
SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate = CONVERT(DATETIME, '19970101');
data:image/s3,"s3://crabby-images/9b8b6/9b8b67f554c070bcd657003dd17378c8bb70aace" alt=""
data:image/s3,"s3://crabby-images/569f9/569f9609e4264c0433675324a49546e2698eae53" alt=""
- first, database searches the page which have a real data matching with condition
- and check the next row
- if the next row is not matched with condition, then return
- So,
idx_emp_ord
andidx_emp_ord
searched same count of row - this is because their search order did not affect to search
SELECT *
FROM TestOrders
ORDER BY EmployeeID, OrderDate;
data:image/s3,"s3://crabby-images/d3eda/d3eda83cb1308b74df99eed687e4864b87d4e661" alt=""
SELECT *
FROM TestOrders
ORDER BY OrderDate, EmployeeID;
data:image/s3,"s3://crabby-images/229f3/229f3fd97d260822595fae1d912f1f7aff949e8b" alt=""
- Case 2: range in search
- both use index also
- but both of logical read time are different
-- Same with next SELECT statement ---------------
SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate >= '19970101' AND OrderDate <= '19970103';
--------------------------------------------------
SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate BETWEEN '19960701' AND '19970103';
data:image/s3,"s3://crabby-images/51f3b/51f3ba869f00626ca71f7dbbbaa5bc6997c4e78e" alt=""
data:image/s3,"s3://crabby-images/277e5/277e599566ce57e41dd6006440274c890dc75c9d" alt=""
SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate BETWEEN '19960701' AND '19970103';
data:image/s3,"s3://crabby-images/8345b/8345ba89b5fce6fc4909ef0382b2d63b1901f241" alt=""
data:image/s3,"s3://crabby-images/88802/88802baab775e95ac0e2263f49db4ee265fc344f" alt=""
- in
idx_emp_ord
, database searches the page which match with1
inEmployeeID
, and then match with between19970101
and19970103
inOrderDate
- and check next row
- it is ordered by
EmployeeID
first, so next row must have same or biggerEmployeeID
- So,
idx_emp_ord
searched only 3 rows
SELECT *
FROM TestOrders
ORDER BY EmployeeID, OrderDate;
data:image/s3,"s3://crabby-images/9f2b1/9f2b1be5379472a27af4998852922080b24f6179" alt=""
- in
idx_ord_emp
, database searches the page wich match with between19970101
and19970103
inOrderDate
, and then match with1
inEmployeeID
- and check next row
- it is ordered by
OrderDate
first, So if theOrderDate
is in between19970101
and19970103
, then database have to checkEmployeeID
in the rows - So
idx_ord_emp
searched more then 3 rows
SELECT *
FROM TestOrders
ORDER BY OrderDate, EmployeeID;
data:image/s3,"s3://crabby-images/87cfc/87cfc5bedff66f801493bf70e1ada8251d36ea6b" alt=""
IN-LIST
- IN()
- if between range is small, then change the range to
IN()
- if between range is small, then change the range to
SET STATISTICS PROFILE ON;
SELECT *
FROM TestOrders WITH(INDEX(idx_ord_emp))
WHERE EmployeeID = 1 AND OrderDate IN('19960101','19970102', '19970103');
data:image/s3,"s3://crabby-images/cbfba/cbfbaafd53ccc569691532e8844f0e66e8f5ced3" alt=""
SELECT *
FROM TestOrders WITH(INDEX(idx_emp_ord))
WHERE EmployeeID = 1 AND OrderDate IN('19960101','19970102', '19970103');
data:image/s3,"s3://crabby-images/17c41/17c417f052fb6d5367025bc2c768e95ba60486de" alt=""
Join
Hash Join
- Hash Join does not need to sort → The more data to Merge, The better to Hash
- not be infected by Index
- Hash is better than NL/Merge(depends on situation)
- But you have to concider the cost of hash table(process time become bigger → Index)
- Random Access X
- smaller data is good to make Hash Table
SELECT *
FROM TestOrders AS o
INNER JOIN TestCustomers AS c
ON o.CustomerID = c.CustomerID;
data:image/s3,"s3://crabby-images/06a27/06a27735f3b053853d5109b8c98a81df1e4dae8f" alt=""
- Hash Join in C#
using System;
using System.Collections.Generic;
namespace HashJoin
{
class Player
{
public int playerId;
}
class Salary
{
public int playerId;
}
class HashTable
{
int _bucketCount;
List<int>[] _buckets;
public HashTable(int bucketCount = 100)
{
_bucketCount = bucketCount;
_buckets = new List<int>[bucketCount];
for (int i = 0; i < bucketCount; i++)
_buckets[i] = new List<int>();
}
public void Add(int value)
{
int key = value % _bucketCount;
_buckets[key].Add(value);
}
public bool Find(int value)
{
int key = value % _bucketCount;
// _buckets[key].Contains(value);
foreach(int v in _buckets[key])
{
if (v == value)
return true;
}
return false;
}
}
class Program
{
static void Main(string[] args)
{
Random rand = new Random();
List<Player> players = new List<Player>();
for (int i = 0; i < 1000; i++)
{
if (rand.Next(0, 2) == 0)
continue;
players.Add(new Player() { playerId = i });
}
List<Salary> salaries = new List<Salary>();
for (int i = 0; i < 1000; i++)
{
if (rand.Next(0, 2) == 0)
continue;
salaries.Add(new Salary() { playerId = i });
}
// TEMP HashTable
/*Dictionary<int, Salary> hash = new Dictionary<int, Salary>();
foreach (Salary s in salaries)
hash.Add(s.playerId, s);
List<int> result = new List<int>();
foreach(Player p in players)
{
if (hash.ContainsKey(p.playerId))
result.Add(p.playerId);
}*/
HashTable hash = new HashTable();
foreach (Salary s in salaries)
hash.Add(s.playerId);
List<int> result = new List<int>();
foreach(Player p in players)
{
if (hash.Find(p.playerId))
result.Add(p.playerId);
}
}
}
}