빅데이터 시각화 0323

문제 1. 패키지 설치 방법 MD

1
2
library(ggplot2)
library(dplyr)
1
2
## 
## Attaching package: 'dplyr'
1
2
3
## The following objects are masked from 'package:stats':
##
## filter, lag
1
2
3
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union

문제 2. 질적 변수와 양적 병수의 차이에 대해 설명하시오

질적 변수는 어떠한 카테고리내에서 다양하게 모이는 변수이고, 양적 변수는 크기를 숫자로 나타내어 유의미한 가치를 가지는 변수입니다.

문제 3. 엑셀 데이터를 불러오세요

1
2
3
library(dplyr)
counties<-readxl::read_xlsx("1_day/docs/counties.xslx",sheet=1)
glimpse(counties)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
## Rows: 3,138
## Columns: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
## $ region <chr> "South", "South", "South", "South", "South", "South~
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No~
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
## $ land_area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81, 77~

문제 4. private_work, unemployment를 활용하여 산점도를 작성하세요.

1
2
3
4
5
6
7
library(dplyr)
counties<-readxl::read_xlsx("1_day/docs/counties.xslx",sheet=1)

library(ggplot2)
ggplot(counties,aes(x=private_work,y=unemployment))+geom_point(aes(colour=region))->g1

plot(g1)

문제 5. dplyr 함수를 활용하여, 아래 데이터를 요약하세요.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
## Rows: 3,138
## Columns: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
## $ region <chr> "South", "South", "South", "South", "South", "South~
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No~
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
## $ land_area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81, 77~
1
## `summarise()` has grouped output by 'region'. You can override using the `.groups` argument.
1
2
3
4
5
6
7
## Rows: 50
## Columns: 4
## Groups: region [4]
## $ region <chr> "North Central", "North Central", "North Central", "North Ce~
## $ state <chr> "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnes~
## $ sum_men <dbl> 6316899, 3235263, 1534595, 1439862, 4861973, 2692166, 296400~
## $ sum_women <dbl> 6556862, 3333382, 1558931, 1453125, 5038598, 2727005, 308144~

문제 6. final_df를 기준으로 막대 그래프를 그립니다.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
## Rows: 3,138
## Columns: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "10~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabam~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", ~
## $ region <chr> "South", "South", "South", "South", "South", "South~
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", "No~
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5627~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 603~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3,~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0~
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41~
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2949~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21~
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366, 15~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, ~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401,~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.~
## $ land_area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81, 77~
1
## `summarise()` has grouped output by 'region'. You can override using the `.groups` argument.
1
2
3
4
5
6
## Rows: 12
## Columns: 3
## Groups: region [1]
## $ region <chr> "North Central", "North Central", "North Central", "North Cent~
## $ state <chr> "Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesot~
## $ sum_pop <dbl> 12873761, 6568645, 3093526, 2892987, 9900571, 5419171, 6045448~

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Author

이현정

Posted on

2021-03-23

Updated on

2021-04-06

Licensed under

You need to set install_url to use ShareThis. Please set it in _config.yml.
You forgot to set the business or currency_code for Paypal. Please set it in _config.yml.

Comments

You forgot to set the shortname for Disqus. Please set it in _config.yml.
You need to set client_id and slot_id to show this AD unit. Please set it in _config.yml.