R -- 独立 k 標本

独立 k 標本　　　　　Last modified: Dec 12, 2012
目的

独立 k 標本データに対して，各群のデータ数，平均値，標準偏差を求める。
さらに，独立 2 標本の場合には，平均値の差の検定（t 検定）またはマン・ホイットニーの U 検定（ウィルコクソンの順位和検定），独立 3 標本以上の場合には，一元配置分散分析またはクラスカル・ウォリス検定を行うこともできる。

使用法

indep.sample <- function(i, j, df, latex=TRUE, captions=NULL, labels=NULL,
                         test=c("none", "parametric", "non-parametric"),
                         statistics=c("mean", "median"), var.equal=FALSE,
                         digits=3, output="", encoding=getOption("encoding"))

引数

i          分析対象となる変数が入っている列の番号
           c 関数を使った複数指定可能
j          群を表す変数 factor の列番号
df         読み込んだデータフレームの名前
latex      LaTeX ソースを出力する。Word 用なら，latex=FALSE にする
captions   latex=TRUE の場合，各表の表題を文字列ベクトルとして指定することができる（デフォルトではあり合わせの表題を付ける）。
labels     latex=TRUE の場合，各表の label を文字列ベクトルとして指定することができる（デフォルトでは付けない）。
test       検定法を指定する。"parametric", "non-parametric" の何れか
           "parametric" の場合，2 群の場合には t.test 関数，3 群の場合には oneway.test 関数が呼ばれる
           "non-parametric" の場合，2 群の場合には wilcox.test 関数，3 群の場合には kruskal.test 関数が呼ばれる
           デフォルトは検定をしない
statistics （平均値，標準偏差）を求める（"mean"）か（中央値，四分偏差）を求める（"median"）かを指定する
var.equal  t.test, oneway.test の var.equal 引数として使用する
digits     平均値，標準偏差を書き出すときに小数点以下何桁までにするかを指定
           デフォルトでは 3
output     出力コネクション
           デフルトはコンソールに出力
encoding   ファイルに出力するときのエンコーディング（デフォルトは OS による）

ソース

インストールは，以下の 1 行をコピーし，R コンソールにペーストする
source("http://aoki2.si.gunma-u.ac.jp/R/src/indep_sample.R", encoding="euc-jp")

#####
#
# 独立 k 標本の平均値，標準偏差を求め，必要なら平均値・代表値の差の検定を行う
#
#####

indep.sample <- function(i,                                                  # 分析対象の変数が入っているデータフレーム上の列番号または変数名ベクトル
                         j,                                                     # 群を表す変数が入っているデータフレーム上の列番号または変数名ベクトル
                         df,                                                    # データフレーム
                         latex=TRUE,                                            # LaTeX 形式で集計表を出力する（デフォルトは LaTeX 形式）
                         captions=NULL,                                         # latex=TRUE のときに，各表の表題を表す文字列ベクトルを指定できる（NULL のときはデフォルトの表題）
                         labels=NULL,                                           # latex=TRUE のときに，各表の label を表す文字列ベクトルを指定できる（NULL のときは付けない）
                         test=c("none", "parametric", "non-parametric"),        # デフォルト none では検定を行わない。検定を行うときはその種類を指定する
                         statistics=c("mean", "median"),                        # （平均値，標準偏差）を求めるか（中央値，四分偏差）を求めるかを指定する
                         var.equal=FALSE,                                       # parametric の場合に等分散性を仮定するかどうかの引数
                         digits=3,                                              # 平均値，標準偏差を表示するときの小数点以下の桁数
                         output="",                                             # ファイルに出力するときはファイル名（デフォルトはコンソールに出力）
                         encoding=getOption("encoding"))                        # ファイルに出力するときのエンコーディング（デフォルトは OS による）
{

# 下請け関数

        getNum <- function(str, df) {                                                # 変数名から列番号を得る
                names <- colnames(df)
                seq_along(names)[names %in% str]
        }

        SIQ <- function(x) return(diff(fivenum(x)[c(2,4)]))                     # 四分偏差を求める下請け関数

        indep.sample.sub <- function(ii, jj)
        {
                group <- colnames(df)[jj]                                    # 群を表す変数の名前
                df2 <- df[, c(ii, jj)]                                               # データフレームの列番号 ii, jj から 2 変数を取り出す
                df2 <- subset(df2, complete.cases(df2))                              # 欠損値を持つケースを除く
                x <- df2[, 1]                                                        # 分析対象変数
                g <- df2[, 2]                                                        # 群変数
                lst <- list(g)                                                       # by 関数を適用するために，群をリスト化する
                nt <- length(x)                                                      # 全体のサンプルサイズ
                mt <- MEAN(x)                                                        # 全体の平均値
                st <- SD(x)                                                  # 全体の標準偏差
                n <- by(x, lst, length)                                              # 各群のサンプルサイズ
                m <- by(x, lst, MEAN)                                                # 各群の平均値（中央値）
                s <- by(x, lst, SD)                                          # 各群の標準偏差（四分偏差）
                nr <- length(table(lst))                                     # 群の数
                if (latex) {                                                    # LaTeX 形式で集計結果を出力する
                        cat("\n\\begin{table}[htbp]\n", file=output)            # \begin{table}[htbp]
                        if (is.null(captions)) {
                                cat(sprintf("\\caption{%s別の%sの集計}\n",      # \caption{○○別の□□の集計}
                                    group, colnames(df2)[1]), file=output)
                        }
                        else {
                                cat(sprintf("\\caption{%s}\n", captions[index]), file=output)   # \caption{○○○○}
                        }
                        if (!is.null(labels)) {
                                cat(sprintf("\\label{%s}\n", labels[index]), file=output)       # \labels{○○○○}
                        }
                        cat("\\centering\n", file=output)                       # \centering
                        cat("\\begin{tabular}{lccc} \\hline\n",file=output)     # \begin{tabular}{lccc} \hline
                        cat(sprintf("& \\multicolumn{3}{c}{%s}\\\\ \\cline{2-4}\n", # \multicolumn{3}{c}{□□} \\ \cline{2-4}
                            colnames(df2)[1]), file=output)
                        cat(group, "データ数", M.str, S.str, sep=" & ",              # ○○ & データ数 & 平均値 & 標準偏差
                            file=output)
                        cat("\\\\ \\hline\n", file=output)                      # \\ \hline
                        for (l in 1:nr) {                                       # 各群について，
                                cat(names(n)[l], n[l], sprintf(format, m[l]),   # 群の名前 & 数値 & 数値 & 数値
                                    sprintf(format, s[l]), sep=" & ", file=output)
                                cat("\\\\", file=output)                        # \\
                                if (l == nr) cat("\\hline\n", file=output)      # 最後の群なら \hline そうでなければ何もなし
                                else cat("\n", file=output)
                        }
                        cat("全体", nt, sprintf(format, mt),                    # 全体 & 数値 & 数値 & 数値
                            sprintf(format, st), sep=" & ", file=output)
                        cat("\\\\ \\hline\n", file=output)                      # \\ \hline
                        cat("\\end{tabular}\n", file=output)                    # \end{tabular}
                }
                else {                                                          # tab で区切って出力する
                        cat("\n表　", group, "別の", colnames(df2)[1],          # 表　○○別の□□の集計
                            "の集計", sep="", file=output)
                        cat("\n", colnames(df2)[1], sep="\t", file=output,      # 　　□□
                            fill=TRUE)
                        cat(group, "データ数", M.str, S.str, sep="\t",          # ○○　データ数　平均値　標準偏差
                            file=output, fill=TRUE)
                        for (l in 1:nr) {                                       # 各群について，
                                cat(names(n)[l], n[l], sprintf(format, m[l]),   # 群の名前　数値　数値　数値
                                    sprintf(format, s[l]), sep="\t",
                                    file=output, fill=TRUE)
                        }
                        cat("全体", nt, sprintf(format, mt),                    # 全体　数値　数値　数値
                            sprintf(format, st), sep="\t",
                            file=output, fill=TRUE)
                }
                if (nr == 2) {                                                  # 2 群の場合には，
                        if (latex && test != "none") {                            # LaTeX 形式で出力するときには，改行して次の行に出力準備
                                cat("\\\\ \\noindent\n", file=output)
                        }
                        if (test == "parametric") {                             # 平均値の差の検定のために t.test 関数を使う
                                res <- t.test(x~g, var.equal=var.equal)              # t.test を呼ぶ
                                cat(sprintf(if (latex) "$t$値 = %.3f, 自由度 = %.3f, $P$値 = %.3f\n"
                                            else "t 値 = %.3f, 自由度 = %.3f, P 値 = %.3f\n",
                                            res$statistic, res$parameter, res$p.value), file=output)
                        }
                        else if (test == "non-parametric") {                    # マン・ホイットニーの U 検定
                                res <- wilcox.test(x~g)                              # wilcox.test を呼ぶ
                                cat(sprintf(if (latex) "$U$ = %.3f, $P$値 = %.3f\n"
                                            else "U = %.3f, P 値 = %.3f\n",
                                            res$statistic, res$p.value), file=output)
                        }
                }
                else if (nr >= 3) {
                        if (latex && test != "none") {                            # LaTeX 形式で出力するときには，改行して次の行に出力準備
                                cat("\\\\ \\noindent\n", file=output)
                        }
                        if (test == "parametric") {                             # 一元配置分散分析
                                res <- oneway.test(x ~ g, var.equal=var.equal)
                                cat(sprintf(if (latex) "$F$値 = %.3f, 自由度 = (%i, %.3f), $P$値 = %.3f\n"
                                            else "F 値 = %.3f, 自由度 = (%i, %.3f), P 値 = %.3f\n",
                                            res$statistic, res$parameter[1], res$parameter[2], res$p.value), file=output)
                        }
                        else if (test == "non-parametric") {                    # クラスカル・ウォリス検定
                                res <- kruskal.test(x~g)     
                                cat(sprintf(if (latex) "$\\chi_{kw}^2$ = %.3f, 自由度 = %i, $P$値 = %.3f\n"
                                            else "カイ二乗値(kw) = %.3f, 自由度 = %i, P 値 = %.3f\n", 
                                            res$statistic, res$parameter, res$p.value), file=output)
                        } 
                }
                if (latex) {                                                    # LaTeX 形式で集計結果を出力する場合は，
                        cat("\\end{table}\n", file=output)                      # \end{table}
                }
        }

# 関数本体
        if (output != "") {                                                     # 結果をファイルに出力する場合の処理
                output <- file(output, open="w", encoding=encoding)
        }

        test <- match.arg(test)                                                      # 引数が省略形で与えられたときに，正確な名前をとる
        statistics <- match.arg(statistics)                                  # 引数が省略形で与えられたときに，正確な名前をとる
        if (statistics == "mean") {
                MEAN <- mean                                                 # 位置の母数を求める関数：平均値
                SD <- sd                                                     # 散布度を求める関数：標準偏差
                M.str <- "平均値"
                S.str <- "標準偏差"
        }
        else {
                MEAN <- median                                                       # 位置の母数を求める関数：中央値
                SD <-  SIQ                                                   # 散布度を求める関数：四分偏差
                M.str <- "中央値"
                S.str <- "四分偏差"
        }
        format <- paste("%.", digits, "f", sep="")                           # 小数点以下 3 桁で出力する書式
        if (is.character(i[1])) {
                i <- getNum(i, df)
        }
        if (is.character(j[1])) {
                j <- getNum(j, df)
        }

        index <- 0
        for (jj in j) {                                                         # j はベクトルまたはスカラーで，群を表す変数がある列番号
                for (ii in i) {                                                 # i はベクトルまたはスカラーで，分析対象変数がある列番号
                        if (ii != jj) {                                         # i, j の全ての組み合わせについて（ii と jj が違うときのみ），
                                index <- index+1                             # 集計のための下請け関数 indep.sample.sub を呼ぶ
                                indep.sample.sub(ii, jj)
                        }
                }
        }

        if (output != "") {                                                     # 結果をファイルに出力した場合の後始末
                close(output)
        }
}


使用例

男女	血液型	測定値1	測定値2
1	2	101.3	135.8
1	2	101.8	165.7
1	1	122.6	178.9
2	4	114.4	155.8
1	4	89.7	143.0
2	2	83.4	123.7
1	2	93.2	151.3
2	1	99.6	108.8
1	3	127.5	101.6
2	4	104.7	156.1
2	4	84.2	173.7
2	2	90.6	122.6
1	1	90.0	102.2
1	1	97.6	176.7
2	4	97.4	125.0
1	1	92.8	172.8
2	3	103.4	146.5
2	2	101.2	159.0
2	2	87.8	148.0
2	1	95.5	140.5
1	2	89.0	171.8
2	3	94.3	121.2
1	3	69.9	114.9
1	3	79.4	160.0
1	3	105.7	98.1
1	2	95.2	133.5
2	4	113.7	86.5
2	2	106.8	148.9
2	2	112.1	206.1
2	1	91.7	163.0
のようなファイル test.dat があるとする

入力と変数の定義
df <- read.table("test.dat", header=TRUE)
df[,1] <- factor(df[,1], levels=1:2, labels=c("男", "女"))

ファイルに出力するとき
indep.sample(3:4, 1, df, latex=FALSE, output="ファイル名")

出力結果例

latex=FALSE の場合

> indep.sample(3:4, 1, df, latex=FALSE)

表　男女別の測定値1の集計
	測定値1
男女	データ数	平均値	標準偏差
男	14	96.836	15.060
女	16	98.800	9.969
全体	30	97.883	12.413

表　男女別の測定値2の集計
	測定値2
男女	データ数	平均値	標準偏差
男	14	143.307	29.535
女	16	142.838	28.217
全体	30	143.057	28.336

> indep.sample(3:4, 1, df, latex=FALSE, test="parametric")

表　男女別の測定値1の集計
	測定値1
男女	データ数	平均値	標準偏差
男	14	96.836	15.060
女	16	98.800	9.969
全体	30	97.883	12.413
t 値 = -0.415, 自由度 = 22.068, P 値 = 0.682

表　男女別の測定値2の集計
	測定値2
男女	データ数	平均値	標準偏差
男	14	143.307	29.535
女	16	142.838	28.217
全体	30	143.057	28.336
t 値 = 0.044, 自由度 = 27.085, P 値 = 0.965

> indep.sample(3:4, 1, df, latex=FALSE, test="parametric", var.equal=FALSE)

表　男女別の測定値1の集計
	測定値1
男女	データ数	平均値	標準偏差
男	14	96.836	15.060
女	16	98.800	9.969
全体	30	97.883	12.413
t 値 = -0.415, 自由度 = 22.068, P 値 = 0.682

表　男女別の測定値2の集計
	測定値2
男女	データ数	平均値	標準偏差
男	14	143.307	29.535
女	16	142.838	28.217
全体	30	143.057	28.336
t 値 = 0.044, 自由度 = 27.085, P 値 = 0.965

> indep.sample(3:4, 1, df, latex=FALSE, test="parametric", var.equal=TRUE)

表　男女別の測定値1の集計
	測定値1
男女	データ数	平均値	標準偏差
男	14	96.836	15.060
女	16	98.800	9.969
全体	30	97.883	12.413
t 値 = -0.426, 自由度 = 28.000, P 値 = 0.673

表　男女別の測定値2の集計
	測定値2
男女	データ数	平均値	標準偏差
男	14	143.307	29.535
女	16	142.838	28.217
全体	30	143.057	28.336
t 値 = 0.045, 自由度 = 28.000, P 値 = 0.965
latex=TRUE（デフォルト）の場合

> indep.sample(3, 1, df, test="non-parametric")

\begin{table}[htbp]
\caption{男女別の測定値1の集計}
\begin{center}
\begin{tabular}{lccc} \hline
& \multicolumn{3}{c}{測定値1}\\ \cline{2-4}
男女 & データ数 & 平均値 & 標準偏差\\ \hline
男 & 14 & 96.836 & 15.060\\
女 & 16 & 98.800 & 9.969\\\hline
全体 & 30 & 97.883 & 12.413\\ \hline
\end{tabular}
\\ \noindent
$U$ = 97.000, $P$値 = 0.552
\end{center}
\end{table}
直前のページへ戻る　　
E-mail to Shigenobu AOKI