GROUP BY
With aggregate functions, the aggregate is calculated once on each row of unique values:
CREATE TABLE t (c0 CHAR(1), c1 INT);
INSERT INTO t VALUES ('a', 1), ('a', 2), ('b', 3), ('b', 3);
SELECT c0, SUM(c1) FROM t GROUP BY c0 ORDER BY c0;
SELECT c0, COUNT(c1) FROM t GROUP BY c0 ORDER BY c0;
DROP TABLE t;
Output:
c0 SUM(c1)
a 3
b 6
c0 COUNT(c1)
a 2
b 2
GROUP BY and JOIN
CREATE TABLE users (id INT PRIMARY KEY, age INT);
CREATE TABLE posts (
id INT PRIMARY KEY,
userid INT,
FOREIGN KEY (userid) REFERENCES users(id)
);
INSERT INTO users VALUES (1, 20), (2, 20);
INSERT INTO posts VALUES (1, 1), (2, 1), (3, 2);
# How many posts each user has. Also show user age.
SELECT
users.id,
users.age,
COUNT(*)
FROM
users
INNER JOIN
posts
ON
users.id = posts.userid
GROUP BY
users.id
;
DROP TABLE posts;
DROP TABLE users;
Output:
id age COUNT(*)
1 20 2
2 20 1
T-SQL forces you to put users.age
and any non-aggregate column under GROUP BY
as well like:
GROUP BY
users.id,
users.age,
GROUP BY multiple columns
http://stackoverflow.com/questions/2421388/using-group-by-on-multiple-columns
Groups by distinct tuples.
Ignore GROUP BY column
All selected columns must be either grouped by, or applied an aggregate function.
How to avoid repeating columns: http://stackoverflow.com/questions/5364429/group-by-ignoring-an-attribute
GROUP BY without aggregate
Without an aggregate function and for a single column, works exactly like DISTINCT
.
But don’t rely on that and prefer DISTINCT
instead, since GROUP BY
is designed to work with aggregates, and may have subtly different semantics: http://stackoverflow.com/questions/164319/is-there-any-difference-between-group-by-and-distinct
In particular, T-SQL raises an error if you try to do that.
CREATE TABLE t (c0 CHAR(1), c1 INT);
INSERT INTO t VALUES ('a', 1), ('a', 2), ('b', 3), ('b', 3);
SELECT * FROM t GROUP BY c0 ORDER BY c0,c1;
SELECT * FROM t GROUP BY c1 ORDER BY c0,c1;
#TODO what happens on this one? same as above?
SELECT * FROM t GROUP BY c0,c1 ORDER BY c0,c1;
DROP TABLE t;
Output:
c0 c1
a 1
b 3
c0 c1
a 1
a 2
b 3
c0 c1
a 1
a 2
b 3
Applications
Net upvotes minus downvotes
CREATE TABLE votes (
article_id INT,
type INT
);
INSERT INTO votes VALUES
(1, 0), (1, 1), (2, 0), (3, 1);
# Most votes ignoring type.
SELECT article_id, COUNT(article_id) AS count
FROM votes
GROUP BY article_id
ORDER BY count DESC;
# Most upvotes.
SELECT article_id, COUNT(article_id) AS count
FROM votes
WHERE type = 0
GROUP BY article_id
ORDER BY count DESC;
# Upvotes minus downvotes.
SELECT
article_id,
SUM(
CASE type
WHEN 0 THEN 1
WHEN 1 THEN -1
END
) AS count
FROM votes
GROUP BY article_id
ORDER BY count DESC;
DROP TABLE votes;