x <- 4; y <- 3
# Bar chart of diamond counts per cut, stored so that different coordinate
# systems can be layered on the same base plot.
bar <- ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = cut),
           show.legend = FALSE, width = 1) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)
# Same bars drawn horizontally, then in polar coordinates.
bar + coord_flip()
bar + coord_polar()
MATH/COSC 3570 Spring 2025
x <- 4; y <- 3
# Bar chart of diamond counts per cut, stored so that different coordinate
# systems can be layered on the same base plot.
bar <- ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = cut),
           show.legend = FALSE, width = 1) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)
# Same bars drawn horizontally, then in polar coordinates.
bar + coord_flip()
bar + coord_polar()
Briefly describe how we produce a pdf.
Hello everyone, I am Cheng-Han Yu, an assistant professor at Marquette University. I love data science!
My main research interests include
My favorite quote is
"All models are wrong, but some are useful." — George Box
Here I write a simple math equation \(\frac{-b \pm \sqrt{b^2 - 4ac}}{2a}\).
# include image
knitr::include_graphics("https://raw.githubusercontent.com/rstudio/hex-stickers/master/PNG/ggplot2.png")
# include plot
plot(x = mtcars$disp, y = mtcars$mpg)
# show dataset `mtcars`
knitr::kable(mtcars, caption = "A knitr kable table of mtcars data set")
| | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb |
---|---|---|---|---|---|---|---|---|---|---|---|
Mazda RX4 | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160.0 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108.0 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 258.0 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
Merc 240D | 24.4 | 4 | 146.7 | 62 | 3.69 | 3.190 | 20.00 | 1 | 0 | 4 | 2 |
Merc 230 | 22.8 | 4 | 140.8 | 95 | 3.92 | 3.150 | 22.90 | 1 | 0 | 4 | 2 |
Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | 3 |
Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | 3 |
Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | 3 |
Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.200 | 19.47 | 1 | 1 | 4 | 1 |
Honda Civic | 30.4 | 4 | 75.7 | 52 | 4.93 | 1.615 | 18.52 | 1 | 1 | 4 | 2 |
Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.90 | 1 | 1 | 4 | 1 |
Toyota Corona | 21.5 | 4 | 120.1 | 97 | 3.70 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
Fiat X1-9 | 27.3 | 4 | 79.0 | 66 | 4.08 | 1.935 | 18.90 | 1 | 1 | 4 | 1 |
Porsche 914-2 | 26.0 | 4 | 120.3 | 91 | 4.43 | 2.140 | 16.70 | 0 | 1 | 5 | 2 |
Lotus Europa | 30.4 | 4 | 95.1 | 113 | 3.77 | 1.513 | 16.90 | 1 | 1 | 5 | 2 |
Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
Maserati Bora | 15.0 | 8 | 301.0 | 335 | 3.54 | 3.570 | 14.60 | 0 | 1 | 5 | 8 |
Volvo 142E | 21.4 | 4 | 121.0 | 109 | 4.11 | 2.780 | 18.60 | 1 | 1 | 4 | 2 |
There are 11 variables in the `mtcars` data set.
Answer to the questions.
# Radius of the circle, referenced by the inline Python expression below.
radius = 5
The radius of the circle is {python} print(radius)
# Build one object of each basic R data structure.
v1 <- c(3, 8, 4, 5)
fac <- factor(c("bad", "neutral", "good"))
x_lst <- list(idx = 1:3,
              "a",
              c(TRUE, FALSE))
mat <- matrix(data = 1:6,
              nrow = 3,
              ncol = 2)
df <- data.frame(age = c(19, 21, 40),
                 gender = c("m","f", "m"))
# Record the storage type and class of each object. `fac`, `mat` and `df`
# are overwritten by their own summaries; the right-hand side is evaluated
# first, so it still sees the original object.
vec <- c(type = typeof(v1), class = class(v1))
fac <- c(type = typeof(fac), class = class(fac))
lst <- c(type = typeof(x_lst), class = class(x_lst))
mat <- c(type = typeof(mat), class = class(mat))
df <- c(type = typeof(df), class = class(df))
# Collect the summaries in a named list for printing.
list(vector = vec,
     factor = fac,
     list = lst,
     matrix = mat,
     dataframe = df)
$vector
type class
"double" "numeric"
$factor
type class
"integer" "factor"
$list
type class
"list" "list"
$matrix
type class1 class2
"integer" "matrix" "array"
$dataframe
type class
"list" "data.frame"
# Named R list whose elements have different types and lengths.
x_lst <- list(idx = 1:3,
              word = "a",
              bool = c(TRUE, FALSE))
# Python list holding the same three elements as the R list, without names.
py_lst = [[1, 2, 3], "a", [True, False]]
py_lst
[[1, 2, 3], 'a', [True, False]]
# Python dictionary holding the same elements keyed by name.
py_dic = {"idx": [1, 2, 3], "word": "a", "bool": [True, False]}
py_dic
{'idx': [1, 2, 3], 'word': 'a', 'bool': [True, False]}
# Scatter plot of weight against miles per gallon.
plot(
  mtcars$mpg, mtcars$wt,
  main = "MPG vs. Weight",
  xlab = "MPG", ylab = "Wt. (1000 lbs)",
  pch = 8, cex = 2, col = 4
)

# Histogram of quarter-mile times.
hist(
  mtcars$qsec,
  main = "Histogram of 1/4 mile time",
  breaks = 20,
  col = 2,
  border = "#FFCC00"
)

# Horizontal box plots of miles per gallon, one per number of forward gears.
boxplot(
  mpg ~ gear,
  data = mtcars,
  horizontal = TRUE,
  las = 1,
  col = 2:4,
  xlab = "Miles per gallon",
  ylab = "Number of forward gears"
)
import pandas as pd
import matplotlib.pyplot as plt
# Load the mtcars data from the local CSV file and display it.
mtcars = pd.read_csv('./data/mtcars.csv')
mtcars
mpg cyl disp hp drat wt qsec vs am gear carb
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Scatter plot of weight against miles per gallon, drawn in red.
plt.scatter(x = mtcars.mpg,
            y = mtcars.wt,
            color = "r")
plt.xlabel("Miles per gallon")
plt.ylabel("Weight")
plt.title("Scatter plot")
plt.show()
# Clear the current figure before the next plot is drawn.
plt.clf()
# Histogram of quarter-mile times with 19 bins.
plt.hist(mtcars.qsec,
         bins = 19,
         color="#003366",
         edgecolor="#FFCC00")
plt.xlabel("1/4 mile time")
plt.title("Histogram of 1/4 mile time")
plt.show()
# Two-column data frame used to demonstrate data.frame subsetting.
df <- data.frame(abc = 1:2,
                 xyz = c("a", "b"))
# list method
df$x
[1] "a" "b"
df[[2]]
[1] "a" "b"
df["xyz"]
xyz
1 a
2 b
df[c("abc", "xyz")]
abc xyz
1 1 a
2 2 b
# matrix method
df[, 2]
[1] "a" "b"
df[, "xyz"]
[1] "a" "b"
df[, c("abc", "xyz")]
abc xyz
1 1 a
2 2 b
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Tibble with the same columns as `df`, used to compare subsetting behaviour.
tib <- tibble(abc = 1:2,
              xyz = c("a", "b"))
# list method
tib$x
Warning: Unknown or uninitialised column: `x`.
NULL
tib[[2]]
[1] "a" "b"
tib["xyz"]
# A tibble: 2 × 1
xyz
<chr>
1 a
2 b
tib[c("abc", "xyz")]
# A tibble: 2 × 2
abc xyz
<int> <chr>
1 1 a
2 2 b
# matrix method
tib[, 2]
# A tibble: 2 × 1
xyz
<chr>
1 a
2 b
tib[, "xyz"]
# A tibble: 2 × 1
xyz
<chr>
1 a
2 b
tib[, c("abc", "xyz")]
# A tibble: 2 × 2
abc xyz
<int> <chr>
1 1 a
2 2 b
Explain their differences.
With data.frames,
the `$` operator will match any column name that starts with the name following it. Since there is a column named `xyz`, the expression `df$x` will be expanded to `df$xyz`.
This behavior of the `$` operator saves a few keystrokes, but it can result in accidentally using a different column than you thought you were using.
With matrix-style `[` subsetting (`df[, j]`), the type of object that is returned depends on the number of columns. If it is one column, it won’t return a data.frame, but instead will return a vector. With more than one column, it will return a data.frame. This is fine if you know what you are passing in, but suppose you did `df[ , vars]` where `vars` was a variable. Then what that code does depends on `length(vars)`, and you’d have to write code to account for those situations or risk bugs.

For tibbles,
When using the matrix subsetting method, a tibble always returns a tibble.
When using `$` to grab an element, tibbles never do partial matching.
`[]` always returns another tibble, regardless of list or matrix subsetting method.
`$` and `[[]]` return a vector.
Tibbles never do partial matching, so the name “x” cannot be recognized.
What does `tibble::enframe()` do? Try `enframe(c(a = 1, b = 2, c = 3))`. Check `?enframe` for more details.
The function `tibble::enframe()` converts a named vector to a two-column tibble with columns `name` and `value`.
# Summarise the last 12 rows of iris.
iris |> tail(n = 12) |> summary()
Sepal.Length Sepal.Width Petal.Length Petal.Width
Min. :5.800 Min. :2.500 Min. :4.800 Min. :1.800
1st Qu.:6.150 1st Qu.:3.000 1st Qu.:5.100 1st Qu.:1.900
Median :6.600 Median :3.050 Median :5.200 Median :2.200
Mean :6.450 Mean :3.033 Mean :5.292 Mean :2.133
3rd Qu.:6.725 3rd Qu.:3.125 3rd Qu.:5.450 3rd Qu.:2.300
Max. :6.900 Max. :3.400 Max. :5.900 Max. :2.500
Species
setosa : 0
versicolor: 0
virginica :12
# Tibble with an ascending column, a descending column, and the first five
# upper-case letters.
tibble(x = 1:5, y = 5:1, z = LETTERS[1:5])
# A tibble: 5 × 3
x y z
<int> <int> <chr>
1 1 5 A
2 2 4 B
3 3 3 C
4 4 2 D
5 5 1 E
import numpy as np
import pandas as pd
import string
# Split the upper-case ASCII alphabet into a list of single letters.
list(string.ascii_uppercase)
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# Column data for a pandas version of the tibble above: x counts 1..5,
# y counts 5..1, z holds the first five upper-case letters.
dic = {'x':np.arange(1, 6), 'y': np.arange(5, 0, -1), 'z':list(string.ascii_uppercase)[0:5]}
pd.DataFrame(dic)
x y z
0 1 5 A
1 2 4 B
2 3 3 C
3 4 2 D
4 5 1 E
library(tidyverse)
# ssa <- read_csv(file = "./data/ssa-death-probability.csv")
# ssa_male <- ssa[ssa$Sex == "Male",]
# ssa_female <- ssa[ssa$Sex == "Female",]
# Read the male records from the local CSV file.
ssa_male <- readr::read_csv("./data/ssa_male_prob.csv")
Rows: 120 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Sex
dbl (4): Age, DeathProb, NumberOfLives, LifeExp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the female records from the local Rds file.
ssa_female <- readr::read_rds("./data/ssa_female_prob.Rds")
# Draw the female life-expectancy curve first, then overlay the male curve
# on the same axes in a different colour.
plot(x = ssa_female$Age, y = ssa_female$LifeExp,
     type = "l", col = 2, lwd = 3,
     xlab = "Age", ylab = "Life Exp",
     main = "Age vs. Life Exp by Gender")
lines(ssa_male$Age, ssa_male$LifeExp, col = 4, lwd = 3)
# Read the penguins data from the local CSV file.
penguins <- read_csv("./data/penguins.csv")
Rows: 344 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): species, island, sex
dbl (5): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Scatter plot of bill length against bill depth, coloured by species.
penguins |>
  ggplot(mapping = aes(x = bill_depth_mm,
                       y = bill_length_mm,
                       colour = species)) +
  geom_point() +
  labs(title = "Bill depth and length",
       subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
       x = "Bill depth (mm)", y = "Bill length (mm)",
       colour = "Species",
       caption = "Source: Palmer Station LTER / palmerpenguins package") +
  scale_colour_viridis_d()
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
# City mileage against displacement, faceted by drive train (rows) and
# fuel type (columns). The colour legend is dropped.
mpg |> ggplot(mapping = aes(x = displ, y = cty, color = drv, shape = fl)) +
  geom_point(size = 3, alpha = 0.8) +
  facet_grid(drv ~ fl) +
  guides(color = "none")
# Read the penguins data from the local CSV file.
penguins <- read_csv("./data/penguins.csv")
Rows: 344 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): species, island, sex
dbl (5): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Bar chart of the number of penguins per species.
penguins |> ggplot(aes(x = species, fill = species)) +
  geom_bar() +
  labs(x = "Species of Penguins",
       title = "Species Counts in Penguins Data")
# Histogram of bill length, one panel per species, with the legend hidden.
penguins |> ggplot(aes(x = bill_length_mm,
                       fill = species)) +
  geom_histogram() +
  labs(x = "Bill Length (mm)",
       y = "Frequency",
       title = "Penguins Bill Length by Species") +
  facet_wrap(~ species, nrow = 1) +
  theme(legend.position = "none")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_bin()`).
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# Read the loans data from the local CSV file.
loans <- readr::read_csv("./data/loans.csv")
Rows: 10000 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): grade, homeownership
dbl (3): loan_amount, interest_rate, debt_to_income
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Base plotly object with interest rate on the x axis, then one box plot
# per loan grade.
p <- plot_ly(loans, x = ~interest_rate, alpha = 0.5)
p |> add_boxplot(y = ~grade, color = ~grade)
# x = interest_rate, y = grade won't work
# Build the box plot with grade on x, flip the coordinates, then convert
# the ggplot object to an interactive plotly figure.
gg <- loans %>% ggplot(aes(x = grade, y = interest_rate, color = grade)) +
  geom_boxplot() + theme_minimal() + coord_flip()
ggplotly(gg)
murders <- read.csv("./data/murders.csv")
# Compute the rate per 100,000 population and keep West and Northeast
# states with a rate below 1. The outer parentheses print the result.
(my_states <- murders |>
  mutate(rate = total / population * 100000) |>
  filter(region %in% c("West", "Northeast"), rate < 1) |>
  select(state, region, rate))
state region rate
1 Hawaii West 0.5145920
2 Idaho West 0.7655102
3 Maine Northeast 0.8280881
4 New Hampshire Northeast 0.3798036
5 Oregon West 0.9396843
6 Utah West 0.7959810
7 Vermont Northeast 0.3196211
8 Wyoming West 0.8871131
# Mean and standard deviation of the rate per region, highest mean first.
my_states |>
  group_by(region) |>
  summarize(avg = mean(rate), std_dev = sd(rate)) |>
  arrange(desc(avg))
# A tibble: 2 × 3
region avg std_dev
<chr> <dbl> <dbl>
1 West 0.781 0.164
2 Northeast 0.509 0.278
# Download the lookup table describing each diamond colour grade.
diamond_color <- read_csv("https://www.jaredlander.com/data/DiamondColors.csv")
Rows: 10 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Color, Description, Details
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the colour descriptions to every diamond. The key is `color` in
# diamonds and `Color` in the lookup table.
joined_df <- left_join(diamonds, diamond_color, by = c('color' = 'Color')) |>
  select(carat, color, price, Description, Details)
joined_df
# A tibble: 53,940 × 5
carat color price Description Details
<dbl> <chr> <int> <chr> <chr>
1 0.23 E 326 Colorless Minute traces of color
2 0.21 E 326 Colorless Minute traces of color
3 0.23 E 327 Colorless Minute traces of color
4 0.29 I 334 Near Colorless Slightly detectable color
5 0.31 J 335 Near Colorless Slightly detectable color
6 0.24 J 336 Near Colorless Slightly detectable color
7 0.24 I 336 Near Colorless Slightly detectable color
8 0.26 H 337 Near Colorless Color is dificult to detect
9 0.22 E 337 Colorless Minute traces of color
10 0.23 H 338 Near Colorless Color is dificult to detect
# ℹ 53,930 more rows
# Bar chart of the number of diamonds per colour grade.
joined_df |> ggplot(aes(x = color)) +
  geom_bar()
# Number of diamonds per colour grade, most frequent first.
joined_df |> count(color, sort = TRUE)
# A tibble: 7 × 2
color n
<chr> <int>
1 G 11292
2 E 9797
3 F 9542
4 H 8304
5 D 6775
6 I 5422
7 J 2808