R -- 度数分布表と度数分布図の作成

度数分布表と度数分布図の作成　　　　　Last modified: Dec 12, 2012
目的

データフレーム上の複数の変数を指定して，度数分布表と度数分布図を作成する。
数値変数かカテゴリー変数かを識別して，適切な度数分布表とヒストグラムまたは棒グラフを作成する。

使用法

frequency(i, df, latex=TRUE, captions=NULL, labels=NULL, output="", output="", encoding=getOption("encoding"),
          plot="", type=c("pdf", "png", "jpeg", "bmp", "tiff"),
          width=500, height=375, xlab=NULL, ylab="度数", main="")

引数

i        データフレーム上で，度数分布をする変数が入っている，列の番号または変数名（ベクトルとして複数指定可）
df       読み込んだデータフレームの名前
latex    LaTeX ソースを出力する。Word 用なら，latex=FALSE にする
captions latex=TRUE の場合，各表の表題を文字列ベクトルとして指定することができる（デフォルトではあり合わせの表題を付ける）。
labels   latex=TRUE の場合，各表の label を文字列ベクトルとして指定することができる（デフォルトでは付けない）。
plot     ヒストグラムまたは棒グラフを出力するファイルの名前
         複数のグラフが出力されるときは実際には「名前001.pdf]，「名前002.pdf]のように連番のファイル名になる
         デフォルトでは空文字列（""）で，そのときはグラフを出力しない
type     画像ファイルのフォーマットを "pdf", "png", "jpeg", "bmp", "tiff" から指定する。デフォルトは "pdf"
         Macintoshの場合，"png", "jpeg", "bmp", "tiff" を選ぶ場合には，事前に X11 を起動しておかなければならない。
width    画像の横幅のピクセル数（デフォルトは 500 ピクセル）
height   画像の高さのピクセル数（デフォルトは 375 ピクセル）
xlab     横軸のラベル（デフォルトはデータフレームでの変数名）
ylab     縦軸のラベル（デフォルトは「度数」）
main     図のタイトル（デフォルトは何も付けない）
output   出力コネクション
         デフルトはコンソールに出力
encoding ファイルに出力するときのエンコーディング（デフォルトは OS による）

ソース

インストールは，以下の 1 行をコピーし，R コンソールにペーストする
source("http://aoki2.si.gunma-u.ac.jp/R/src/frequency.R", encoding="euc-jp")

######
#
# 分析対象変数の度数分布表を作成し，変数が factor のときには棒グラフ，数値変数の場合にはヒストグラムを描く
#
######

frequency <- function(i,                                                     # 分析対象変数のあるデータフレームの列番号または変数名ベクトル
                      df,                                                       # データフレーム
                      latex=TRUE,                                               # LaTeX 形式で度数分布表を出力する（デフォルトは LaTeX 形式）
                      captions=NULL,                                            # latex=TRUE のときに，各表の表題を表す文字列ベクトルを指定できる（NULL のときはデフォルトの表題）
                      labels=NULL,                                              # latex=TRUE のときに，各表の label を表す文字列ベクトルを指定できる（NULL のときは付けない）
                      output="",                                                # ファイルに出力するときはファイル名（デフォルトはコンソールに出力）
                      encoding=getOption("encoding"),                           # ファイルに出力するときのエンコーディング（デフォルトは OS による）
                      plot="",                                                  # 棒グラフ・ヒストグラムを描き出すファイル名（デフォルトは Quarts デバイスに出力）
                      type=c("pdf", "png", "jpeg", "bmp", "tiff"),              # 画像フォーマット（plot と併せてファイル名の拡張子として使う）
                      width=500,                                                # 画像の横幅のピクセル数（デフォルトは500ピクセル）
                      height=375,                                               # 画像の高さのピクセル数（デフォルトは375ピクセル）
                      xlab=NULL,                                                # 横軸のラベル（デフォルトは対象変数名）。何も描かないときは空文字列を指定する
                      ylab="度数",                                              # 縦軸のラベル（デフォルトは「度数」）。何も描かないときは空文字列を指定する
                      main="")                                                  # グラフのメインタイトル（デフォルトはメインタイトルを付けない）
{

        getNum <- function(str, df) {                                                # 変数名から列番号を得る
                names <- colnames(df)
                seq_along(names)[names %in% str]
        }

        if (output != "") {                                                     # 結果をファイルに出力する場合の処理
                output <- file(output, open="w", encoding=encoding)
        }

                                                                                # グラフをファイルに出力する場合の処理
        if (plot != "") {                                                       # グラフをファイルに出力するとき plot はファイル名（拡張子を除く）
                type <- match.arg(type)                                              # 画像ファイルの形式
                if (type == "pdf") {                                            # pdf なら，一つ一つの画像を別々のファイルに出力するために onefile = FALSE にする
                        pdf(sprintf("%s%%03i.pdf", plot), onefile=FALSE, width=width/72, height=height/72)
                                                                                # pdf は，画像の大きさの指定がインチ単位なので 72dot/inch で換算
                }
                else if (type == "png") {
                        png(sprintf("%s%%03i.%s", plot, type), width=width, height=height)
                }
                else if (type == "bmp") {
                        bmp(sprintf("%s%%03i.%s", plot, type), width=width, height=height)
                }
                else if (type == "jpeg") {
                        jpeg(sprintf("%s%%03i.%s", plot, type), width=width, height=height)
                }
                else { # type == "tiff"
                        tiff(sprintf("%s%%03i.%s", plot, type), width=width, height=height)
                }
        }

        if (is.character(i[1])) {
                i <- getNum(i, df)
        }
# それぞれの分析対象変数について
        index <- 0
        for (i1 in i) {                                                         # i には，分析対象とする変数のデータフレーム上での列位置を示す番号がベクトルとして入っている
                x <- subset(df[,i1], !is.na(df[,i1]))                                # 欠損値を除いたデータを対象とする
                index <- index+1
                v.name <- colnames(df)[i1]                                   # 変数の名前を取り出す
                xlab2 = if (is.null(xlab)) v.name else xlab                     # x 軸のラベル。デフォルト（NULL）なら集計対象とする変数名。描かないなら空文字列
                
# 変数が factor のとき

                if (is.factor(x)) {                                             # 集計対象変数が factor なら，度数と相対度数だけを求め，棒グラフ（barplot）を描く
                        count <- table(x)                                    # table で，度数分布を求める
                        n <- sum(count)                                              # NA を除く有効ケース数
                        freq <- count/n*100                                  # 相対度数（%）を求める
                        name <- names(freq)                                  # 各カテゴリーの名前
                        ln <- length(freq)                                   # カテゴリー数
                        if (latex) {                                                            # LaTeX 形式で度数分布表を出力する
                                cat("\n\\begin{table}[htbp]\n", file=output)                    # \begin{table}[htbp]
                                if (is.null(captions)) {
                                        cat(sprintf("\\caption{%sの度数分布}\n", v.name), file=output)  # \caption{○○の度数分布}
                                }
                                else {
                                        cat(sprintf("\\caption{%s}\n", captions[index]), file=output)   # \caption{○○○○}
                                }
                                if (!is.null(labels)) {
                                        cat(sprintf("\\label{%s}\n", labels[index]), file=output)       # \labels{○○○○}
                                }
                                cat("\\centering\n", file=output)                               # \centering
                                cat("\\begin{tabular}{lrr} \\hline\n", file=output)             # \begin{tabular}{lrr} \hline
                                cat("項目\t&\t度数\t&\t相対度数\\\\ \\hline\n", file=output)      # 項目 & 度数 & 相対度数 \\ \hline
                                for (j in 1:ln) {                                               # 項目の数だけ繰り返す
                                        cat(sprintf("%s\t&\t%i\t&\t%.1f\\\\%s\n",         # ○○ & ○○ & ○○.○ \\
                                            name[j], count[j], freq[j],                         # ○○ & ○○ & ○○.○ \\
                                            if (j == ln) "\\hline" else ""), file=output)       # ○○ & ○○ & ○○.○ \\ \hline
                                }
                                cat(sprintf("合計\t&\t%i\t&\t100.0\\\\ \\hline\n", n),            # 合計 & ○○ & ○○.○ \\ \hline
                                    file=output)
                                cat("\\end{tabular}\n", file=output)                            # \end{tabular}
                                cat("\\end{table}\n", file=output)                              # \end{table}
                        }
                        else {                                                                  # tab で区切って出力する
                                cat("\n表　", v.name, "の度数分布\n\n", sep="", file=output)    # 表　○○の度数分布
                                cat("項目", "度数", "相対度数\n", sep="\t", file=output)        # 項目  度数    相対度数
                                for (j in 1:ln) {                                               # 項目の数だけ繰り返す
                                        cat(name[j], count[j], sprintf("%.1f\n", freq[j]),      # ○○  ○○    ○○.○
                                            sep="\t", file=output)                              # ○○  ○○    ○○.○
                                }                                                               # ○○  ○○    ○○.○
                                cat("合計", n, sprintf("%.1f\n", 100), sep="\t", file=output)   # 合計  ○○    ○○.○
                        }
                        barplot(count, xlab=xlab2, ylab=ylab)                                   # 棒グラフ（barplot）を描く
                }

# 数値変数のとき

                else {                                                          # 集計対象変数が factor でない（間隔尺度・比尺度）なら，累積度数も求め，ヒストグラムを描く
                        options(warn=-1)                                        # 次の行のような hist の使い方（plot=FALSE）で，R2.4.0 が過剰な warning を出すので対策を取った（R2.4.1 からは不要）
                        ans <- hist(x, right=FALSE, plot=FALSE)                      # 階級，度数などの hist オブジェクトを得る。級限界値は，左を含み，右を含まない
                        ln <- length(ans$breaks)                             # breaks の個数（階級数より 1 だけ少ない）
                        if (ans$breaks[ln] == max(x)) {                         # もし breaks の一番大きい値が，データ中の最大値なら，その値は右側級限界に含まれてしまっている
                                ans <- hist(x, breaks=seq(ans$breaks[1], by=diff(ans$breaks)[1], length=ln+1), right=FALSE, plot=FALSE)
                                                                                # 一番大きい階級の上に階級数を一つ増やして度数分布を取り直す
                        }
                        options(warn=0)                                         # warning のレベルを元に戻す
                        count <- ans$counts                                  # hist オブジェクトから，度数を取り出す
                        ccount <- cumsum(count)                                      # 累積度数を作る
                        n <- sum(count)                                              # サンプルサイズ
                        freq <- count/n*100                                  # 相対度数（%）
                        cfreq <- ccount/n*100                                        # 累積相対度数（%）
                        ln <- length(freq)                                   # 階級数
                        name <- ans$breaks[1:ln]                             # 階級の名前は，級限界値とする
                        fraction <- 0                                                # 階級表示に必要な小数点以下の桁数
                        for (j in 1:ln) {
                                char.vec <- unlist(strsplit(as.character(name[j]), "\\."))
                                if (length(char.vec) == 2) {
                                        fraction <- max(fraction, nchar(char.vec[2]))
                                }
                        }
                        if (fraction == 0) {
                                fmt <- "%s～"
                        }
                        else {
                                fmt <- sprintf("%%.%sf～", fraction)
                        }
                        if (latex) {                                                            # LaTeX 形式で度数分布表を出力する
                                cat("\n\\begin{table}[htbp]\n", file=output)                    # \begin{table}[htbp]
                                if (is.null(captions)) {
                                        cat(sprintf("\\caption{%sの度数分布}\n", v.name), file=output)  # \caption{○○の度数分布}
                                }
                                else {
                                        cat(sprintf("\\caption{%s}\n", captions[index]), file=output)   # \caption{○○○○}
                                }
                                if (!is.null(labels)) {
                                        cat(sprintf("\\label{%s}\n", labels[index]), file=output)       # \labels{○○○○}
                                }
                                cat("\\centering\n", file=output)                               # \centering
                                cat("\\begin{tabular}{lrrrr} \\hline\n", file=output)           # \begin{tabular}{lrrrr} \hline
                                cat("階級\t&\t度数\t&\t相対度数\t&\t累積度数\t&\t累積相対度数\\\\ \\hline\n", file=output)
                                                                                                # 階級 & 度数 & 相対度数 & 累積度数 & 累積相対度数 \\ \hline
                                fmt <- paste(fmt, "\t&\t%i\t&\t%.1f\t&\t%i\t&\t%.1f\\\\%s\n", sep="")
                                for (j in 1:ln) {                                               # 階級の数だけ繰り返す
                                        cat(sprintf(fmt,
                                            name[j], count[j], freq[j], ccount[j], cfreq[j],    # ○○ & ○○ & ○○.○ & ○○ & ○○.○ \\
                                            if (j == ln) " \\hline" else ""), file=output)      # ○○ & ○○ & ○○.○ & ○○ & ○○.○ \\ \hline
                                }
                                cat(sprintf("合計\t&\t%i\t&\t100.0\\\\ \\hline\n", n),            # 合計 & ○○ & ○○.○ & ○○ & ○○.○ \\ \hline
                                    file=output)
                                cat("\\end{tabular}\n", file=output)                            # \end{tabular}
                                cat("\\end{table}\n", file=output)                              # \end{table}
                        }
                        else {                                                                  # tab で区切って出力する
                                cat("\n表　", v.name, "の度数分布\n\n", sep="", file=output)    # 表　○○の度数分布
                                cat("階級\t度数\t相対度数\t累積度数\t累積相対度数\n",           # 階級  度数相対度数    累積度数        累積相対度数
                                    file=output)
                                fmt <- paste(fmt, "\t%i\t%.1f\t%i\t%.1f\n", sep="")
                                for (j in 1:ln) {                                               # 階級の数だけ繰り返す
                                        cat(sprintf(fmt,                                        # 階級，度数
                                            name[j], count[j], freq[j], ccount[j], cfreq[j]),   # 相対度数，累積度数
                                            file=output)                                        # 累積相対度数
                                }
                                cat("合計", n, sprintf("%.1f\n", 100), sep="\t", file=output)   # 合計，○○，○○.○，○○，○○.○
                        }
                        plot(ans, main=main, xlab=xlab2, ylab=ylab)                             # ヒストグラムを描く（main, xlab, ylab を指定できる）
                }
        }

        if (output != "") {                                                     # 結果をファイルに出力した場合の後始末
                close(output)
        }
                                                                                # グラフをファイルに出力した場合の後始末
        if (plot != "") {                                                       # ファイルに出力しているなら，
                dev.off()                                                       # デバイスを閉じる
        }
}


使用例

set.seed(12345)
x <- rnorm(1000, mean=50, sd=10)
y <- factor(sample(4, 1000, replace=TRUE), labels=LETTERS[1:4])
df <- data.frame(x=x, y=y)
frequency(1:2, df, latex=FALSE, plot="A")

出力結果例


表　xの度数分布

項目	度数	相対度数	累積度数	累積相対度数
20～	4	0.4	4	0.4
25～	21	2.1	25	2.5
30～	41	4.1	66	6.6
35～	83	8.3	149	14.9
40～	137	13.7	286	28.6
45～	192	19.2	478	47.8
50～	190	19.0	668	66.8
55～	184	18.4	852	85.2
60～	74	7.4	926	92.6
65～	42	4.2	968	96.8
70～	28	2.8	996	99.6
75～	2	0.2	998	99.8
80～	2	0.2	1000	100.0
合計	1000	100.0

表　yの度数分布

項目	度数	相対度数
A	272	27.2
B	231	23.1
C	252	25.2
D	245	24.5
合計	1000	100.0
直前のページへ戻る　　
E-mail to Shigenobu AOKI