<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>小生这厢有礼了(BioFaceBook Personal Blog) &#187; work</title>
	<atom:link href="http://www.biofacebook.com/?feed=rss2&#038;tag=work" rel="self" type="application/rss+xml" />
	<link>http://www.biofacebook.com</link>
	<description>记录生物信息学点滴足迹（NGS,Genome,Meta,Linux)</description>
	<lastBuildDate>Sun, 23 Aug 2020 03:28:53 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>https://wordpress.org/?v=4.1.41</generator>
	<item>
		<title>Using prcomp/princomp for PCA in R （二）</title>
		<link>http://www.biofacebook.com/?p=590</link>
		<comments>http://www.biofacebook.com/?p=590#comments</comments>
		<pubDate>Fri, 31 Aug 2012 04:08:34 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[R语言]]></category>
		<category><![CDATA[统计学习]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=590</guid>
		<description><![CDATA[<p>###############################</p> <p>PCA ############################### install.packages(&#8220;vegan&#8221;) library(vegan)</p> <p>&#62; STpcoa&#60;-read.table(file=&#8221;bactera_16s_final.subsample.phylip.tre1.weighted.phylip.pcoa.axes&#8221;, header=T,row.names=1) &#62; STpcoa axis1 axis2 axis3 axis4 Cellulose -0.020878 -0.234601 0.167454 0 Foodwaste -0.234592 0.221741 0.085802 0 Sludge 0.368882 0.100725 -0.010570 0 Xylan -0.113413 -0.087865 -0.242686 0 &#62;pl.STpcoa&#60;-princomp(STpcoa) &#62; summary(pl.STpcoa) Importance of components: Comp.1 Comp.2 Comp.3 Comp.4 Standard deviation 0.2260563 0.1746944 0.1536319 0 Proportion of Variance 0.4856521 0.2900347 [...]]]></description>
				<content:encoded><![CDATA[<p>###############################</p>
<p>PCA<br />
###############################<br />
install.packages(&#8220;vegan&#8221;)<br />
library(vegan)</p>
<p>&gt; STpcoa&lt;-read.table(file=&#8221;bactera_16s_final.subsample.phylip.tre1.weighted.phylip.pcoa.axes&#8221;, header=T,row.names=1)<br />
&gt; STpcoa<br />
axis1 axis2 axis3 axis4<br />
Cellulose -0.020878 -0.234601 0.167454 0<br />
Foodwaste -0.234592 0.221741 0.085802 0<br />
Sludge 0.368882 0.100725 -0.010570 0<br />
Xylan -0.113413 -0.087865 -0.242686 0<br />
&gt;pl.STpcoa&lt;-princomp(STpcoa)<br />
&gt; summary(pl.STpcoa)<br />
Importance of components:<br />
Comp.1 Comp.2 Comp.3 Comp.4<br />
Standard deviation 0.2260563 0.1746944 0.1536319 0<br />
Proportion of Variance 0.4856521 0.2900347 0.2243133 0<br />
Cumulative Proportion 0.4856521 0.7756867 1.0000000 1</p>
<p>&gt; ls(pl.STpcoa)<br />
[1] &#8220;call&#8221; &#8220;center&#8221; &#8220;loadings&#8221; &#8220;n.obs&#8221; &#8220;scale&#8221; &#8220;scores&#8221; &#8220;sdev&#8221;<br />
&gt; class(pl.STpcoa)<br />
[1] &#8220;princomp&#8221;<br />
&gt; nmds.col&lt;-c(rep(&#8220;green&#8221;, 1), rep(&#8220;blue&#8221;, 1), rep(&#8220;black&#8221;,1), rep(&#8220;red&#8221;,1))<br />
&gt; plot(pl.STpcoa$scores, col=nmds.col, pch=20)<br />
&gt; legend(x=0.12, y=0.25, c(&#8220;Cellulose&#8221;,&#8221;Foodwaste&#8221;,&#8221;Sludge&#8221;,&#8221;Xylan&#8221;),c(&#8220;green&#8221;,&#8221;blue&#8221;,&#8221;black&#8221;,&#8221;red&#8221;),bty=&#8221;n&#8221;)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/pca-comp-8-31.png"><img class="alignleft size-full wp-image-591" title="pca-comp-8-31" src="http://www.biofacebook.com/wp-content/uploads/2012/08/pca-comp-8-31.png" alt="" width="750" height="400" /></a><br />
&gt; biplot(pl.STpcoa)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/bioCOMP.jpeg"><img class="alignleft size-full wp-image-592" title="bioCOMP" src="http://www.biofacebook.com/wp-content/uploads/2012/08/bioCOMP.jpeg" alt="" width="750" height="400" /></a><br />
##############################<br />
&gt; pl2.STpcoa&lt;-prcomp(STpcoa)<br />
&gt; class(pl2.STpcoa)<br />
[1] &#8220;prcomp&#8221;<br />
&gt; pl2.STpcoa$sd^2<br />
[1] 0.06813525 0.04069083 0.03147035 0.00000000<br />
&gt; summary(pl2.STpcoa)<br />
Importance of components:<br />
PC1 PC2 PC3 PC4<br />
Standard deviation 0.2610 0.2017 0.1774 0<br />
Proportion of Variance 0.4857 0.2900 0.2243 0<br />
Cumulative Proportion 0.4857 0.7757 1.0000 1<br />
&gt; pl2.STpcoa$x<br />
PC1 PC2 PC3 PC4<br />
Cellulose -0.02087762 -0.2346017 0.16745306 0<br />
Foodwaste -0.23459165 0.2217407 0.08580311 0<br />
Sludge 0.36888225 0.1007250 -0.01056992 0<br />
Xylan -0.11341297 -0.0878640 -0.24268626 0<br />
&gt; pl2.STpcoa$rotation<br />
PC1 PC2 PC3 PC4<br />
axis1 1.000000e+00 -9.352971e-08 -8.835157e-07 0<br />
axis2 9.353330e-08 1.000000e+00 4.064575e-06 0<br />
axis3 8.835153e-07 -4.064576e-06 1.000000e+00 0<br />
axis4 0.000000e+00 0.000000e+00 0.000000e+00 1<br />
&gt; plot(pl2.STpcoa$x, col=nmds.col, pch=20)<br />
&gt; legend(x=0.12, y=0.25, c(&#8220;Cellulose&#8221;,&#8221;Foodwaste&#8221;,&#8221;Sludge&#8221;,&#8221;Xylan&#8221;),c(&#8220;green&#8221;,&#8221;blue&#8221;,&#8221;black&#8221;,&#8221;red&#8221;),bty=&#8221;n&#8221;)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/pca-pc-8-31.png"><img class="alignleft size-full wp-image-593" title="pca-pc-8-31" src="http://www.biofacebook.com/wp-content/uploads/2012/08/pca-pc-8-31.png" alt="" width="750" height="400" /></a><br />
&gt; screeplot(pl2.STpcoa,type=&#8221;lines&#8221;,main=&#8221;Scree Plot&#8221;)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Scree-Plot-PC-8-31.jpeg"><img class="alignleft size-full wp-image-594" title="Scree-Plot-PC-8-31" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Scree-Plot-PC-8-31.jpeg" alt="" width="750" height="400" /></a><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/bactera_pcoa.axes_.txt">bactera_16s_final.subsample.phylip.tre1.weighted.phylip.pcoa.axes</a></p>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=590</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>Using prcomp/princomp for PCA in R （一）</title>
		<link>http://www.biofacebook.com/?p=587</link>
		<comments>http://www.biofacebook.com/?p=587#comments</comments>
		<pubDate>Fri, 31 Aug 2012 04:00:07 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[R语言]]></category>
		<category><![CDATA[统计学习]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=587</guid>
		<description><![CDATA[<p>Difference between prcomp and princomp:</p> <p>&#8216;princomp&#8217; can only be used with more units than variables&#8221;</p> <p>prcomp是基于SVD分解（svd()函数，princomp是基于特征向量eigen()函数)</p> <p>Good video source:</p> <p>http://www.youtube.com/watch?v=oZ2nfIPdvjY</p> <p>http://www.youtube.com/watch?v=I5GxNzKLIoU&#038;feature=relmfu</p> <p>http://www.planta.cn/forum/viewtopic.php?t=16754&#038;highlight=%D3%EF%D1%D4</p> <p>###########################################</p> <p>以下所有代码包括练习数据，都可在R平台上直接运行。</p> <p>#主成分分析和主成分回归 主成分分析的思想是Pearson 1901年提出的，Hotelling 1933进一步发展 在R中，进行主成分分析用到princomp() 函数</p> <p>用法 princomp(x, cor = FALSE, scores = TRUE, covmat = NULL, subset = rep(TRUE, nrow(as.matrix(x))), &#8230;)</p> <p># 分析用数据 # cor 是否用样本的协方差矩阵作主成分分析 prcomp() 二 summary()函数 三 [...]]]></description>
				<content:encoded><![CDATA[<p>Difference between prcomp and princomp:</p>
<p>&#8216;princomp&#8217; can only be used with more units than variables&#8221;</p>
<p>prcomp是基于SVD分解（svd()函数，princomp是基于特征向量eigen()函数)</p>
<p>Good video source:</p>
<p>http://www.youtube.com/watch?v=oZ2nfIPdvjY</p>
<p>http://www.youtube.com/watch?v=I5GxNzKLIoU&#038;feature=relmfu</p>
<p>http://www.planta.cn/forum/viewtopic.php?t=16754&#038;highlight=%D3%EF%D1%D4</p>
<p>###########################################</p>
<p>以下所有代码包括练习数据，都可在R平台上直接运行。</p>
<p>#主成分分析和主成分回归<br />
主成分分析的思想是Pearson 1901年提出的，Hotelling 1933进一步发展<br />
在R中，进行主成分分析用到princomp() 函数</p>
<p>用法<br />
princomp(x, cor = FALSE, scores = TRUE, covmat = NULL,<br />
subset = rep(TRUE, nrow(as.matrix(x))), &#8230;)</p>
<p># 分析用数据<br />
# cor 是否用样本的协方差矩阵作主成分分析<br />
prcomp()<br />
二 summary()函数<br />
三 loadings()函数<br />
四 predict() 函数<br />
五 screeplot() 函数<br />
六 biplot() 函数</p>
<p>实例<br />
某中学随机抽取某年级30名学生，测量其身高，体重，胸围，坐高，针对这30名中学生身体四项指标数据做主成分分析。<br />
student&lt;-data.frame(<br />
X1=c(148, 139, 160, 149, 159, 142, 153, 150, 151, 139,<br />
140, 161, 158, 140, 137, 152, 149, 145, 160, 156,<br />
151, 147, 157, 147, 157, 151, 144, 141, 139, 148 ),<br />
X2=c(41, 34, 49, 36, 45, 31, 43, 43, 42, 31,<br />
29, 47, 49, 33, 31, 35, 47, 35, 47, 44,<br />
42, 38, 39, 30, 48, 36, 36, 30, 32, 38 ),<br />
X3=c(72, 71, 77, 67, 80, 66, 76, 77, 77, 68,<br />
64, 78, 78, 67, 66, 73, 82, 70, 74, 78,<br />
73, 73, 68, 65, 80, 74, 68, 67, 68, 70),<br />
X4=c(78, 76, 86, 79, 86, 76, 83, 79, 80, 74,<br />
74, 84, 83, 77, 73, 79, 79, 77, 87, 85,<br />
82, 78, 80, 75, 88, 80, 76, 76, 73, 78 )<br />
)<br />
#主成分分析<br />
student.pr &lt;- princomp(student, cor = TRUE)<br />
#显示结果<br />
summary(student.pr, loadings=TRUE)<br />
#预测，显示各样本主成分的值<br />
pre&lt;-predict(student.pr)<br />
#显示碎石图<br />
screeplot(student.pr,type=&#8221;lines&#8221;)<br />
# 主成分分析散点图<br />
biplot(student.pr)</p>
<p>例二<br />
对128个成年男子的身材进行测量，每人测得16项指标，身高，坐高，胸围，头高，裤长，下档，手长，领围，前胸，后背，肩厚，肩宽，袖长，肋围，腰围，腿肚，分别用X1-X16表示。16项指标的相关矩阵R。从相关矩阵出发进行主成分分析，随16项指标进行分类。<br />
命令<br />
x&lt;-c(<br />
1.00,<br />
0.79, 1.00,<br />
0.36, 0.31, 1.00,<br />
0.96, 0.74, 0.38, 1.00,<br />
0.89, 0.58, 0.31, 0.90, 1.00,<br />
0.79, 0.58, 0.30, 0.78, 0.79, 1.00,<br />
0.76, 0.55, 0.35, 0.75, 0.74, 0.73, 1.00,<br />
0.26, 0.19, 0.58, 0.25, 0.25, 0.18, 0.24, 1.00,<br />
0.21, 0.07, 0.28, 0.20, 0.18, 0.18, 0.29,-0.04, 1.00,<br />
0.26, 0.16, 0.33, 0.22, 0.23, 0.23, 0.25, 0.49,-0.34, 1.00,<br />
0.07, 0.21, 0.38, 0.08,-0.02, 0.00, 0.10, 0.44,-0.16, 0.23, 1.00,<br />
0.52, 0.41, 0.35, 0.53, 0.48, 0.38, 0.44, 0.30,-0.05, 0.50, 0.24, 1.00,<br />
0.77, 0.47, 0.41, 0.79, 0.79, 0.69, 0.67, 0.32, 0.23, 0.31, 0.10, 0.62, 1.00,<br />
0.25, 0.17, 0.64, 0.27, 0.27, 0.14, 0.16, 0.51, 0.21, 0.15, 0.31, 0.17, 0.26, 1.00,<br />
0.51, 0.35, 0.58, 0.57, 0.51, 0.26, 0.38, 0.51, 0.15, 0.29, 0.28, 0.41, 0.50, 0.63, 1.00,<br />
0.21, 0.16, 0.51, 0.26, 0.23, 0.00, 0.12, 0.38, 0.18, 0.14, 0.31, 0.18, 0.24, 0.50, 0.65, 1.00<br />
)<br />
names&lt;-c(&#8220;X1&#8243;, &#8220;X2&#8243;, &#8220;X3&#8243;, &#8220;X4&#8243;, &#8220;X5&#8243;, &#8220;X6&#8243;, &#8220;X7&#8243;, &#8220;X8&#8243;, &#8220;X9&#8243;,<br />
&#8220;X10&#8243;, &#8220;X11&#8243;, &#8220;X12&#8243;, &#8220;X13&#8243;, &#8220;X14&#8243;, &#8220;X15&#8243;, &#8220;X16&#8243;)<br />
R&lt;-matrix(0, nrow=16, ncol=16, dimnames=list(names, names))<br />
for (i in 1:16){<br />
for (j in 1:i){<br />
R&lt;-x[(i-1)*i/2+j]; R[j,i]&lt;-R<br />
}<br />
}<br />
#主成分分析<br />
pr&lt;-princomp(covmat=R)<br />
load&lt;-loadings(pr)</p>
<p>#<br />
plot(load[,1:2])<br />
text(load[,1], load[,2], adj=c(-0.4, 0.3))</p>
<p>主成分回归<br />
考虑进口总额Y与三个自变量：国内总产值，存储量，总消费量之间的关系。现收集了1949-1959共11年的数据，试做线性回归和主成分回归分析。<br />
conomy&lt;-data.frame(<br />
x1=c(149.3, 161.2, 171.5, 175.5, 180.8, 190.7, 202.1, 212.4, 226.1, 231.9, 239.0),<br />
x2=c(4.2, 4.1, 3.1, 3.1, 1.1, 2.2, 2.1, 5.6, 5.0, 5.1, 0.7),<br />
x3=c(108.1, 114.8, 123.2, 126.9, 132.1, 137.7, 146.0, 154.1, 162.3, 164.3, 167.6),<br />
y=c(15.9, 16.4, 19.0, 19.1, 18.8, 20.4, 22.7, 26.5, 28.1, 27.6, 26.3)<br />
)</p>
<p>线性回归<br />
lm.sol&lt;-lm(y~x1+x2+x3, data=conomy)<br />
summary(lm.sol)<br />
主成分回归</p>
<p># 主成分分析<br />
conomy.pr&lt;-princomp(~x1+x2+x3, data=conomy, cor=T)<br />
summary(conomy.pr, loadings=TRUE)<br />
pre&lt;-predict(conomy.pr)<br />
conomy$z1&lt;-pre[,1]; conomy$z2&lt;-pre[,2]<br />
lm.sol&lt;-lm(y~z1+z2, data=conomy)<br />
summary(lm.sol)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/pca_135.jpeg"><img class="alignleft size-full wp-image-588" title="pca_135" src="http://www.biofacebook.com/wp-content/uploads/2012/08/pca_135.jpeg" alt="" width="558" height="557" /></a></p>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=587</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>4sample CA RDA analysis</title>
		<link>http://www.biofacebook.com/?p=559</link>
		<comments>http://www.biofacebook.com/?p=559#comments</comments>
		<pubDate>Thu, 30 Aug 2012 04:14:47 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[R语言]]></category>
		<category><![CDATA[统计学习]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=559</guid>
		<description><![CDATA[<p>&#160;</p> <p>&#62; gtsdata_test=read.table(&#8220;gtsdata.txt&#8221;, header=T) &#62; gtsenv=read.table(&#8220;gtsenv.txt&#8221;, header=T) &#62; gtsdata_data_t&#60;-t(gtsdata_data) &#62; decorana(gtsdata_data_t)</p> <p>Call: decorana(veg = gtsdata_data_t)</p> <p>Detrended correspondence analysis with 26 segments. Rescaling of axes with 4 iterations.</p> <p>DCA1 DCA2 DCA3 DCA4 Eigenvalues 0.8634 0.4834 0.23788 0 Decorana values 0.8721 0.3793 0.07223 0 Axis lengths 5.3292 2.1115 1.80907 0</p> <p>&#62; gts.ca=cca(gtsdata_data_t) &#62; gts.ca Call: cca(X = [...]]]></description>
				<content:encoded><![CDATA[<p>&nbsp;</p>
<p>&gt; gtsdata_test=read.table(&#8220;gtsdata.txt&#8221;, header=T)<br />
&gt; gtsenv=read.table(&#8220;gtsenv.txt&#8221;, header=T)<br />
&gt; gtsdata_data_t&lt;-t(gtsdata_data)<br />
&gt; decorana(gtsdata_data_t)</p>
<p>Call:<br />
decorana(veg = gtsdata_data_t)</p>
<p>Detrended correspondence analysis with 26 segments.<br />
Rescaling of axes with 4 iterations.</p>
<p>DCA1 DCA2 DCA3 DCA4<br />
Eigenvalues 0.8634 0.4834 0.23788 0<br />
Decorana values 0.8721 0.3793 0.07223 0<br />
Axis lengths 5.3292 2.1115 1.80907 0</p>
<p>&gt; gts.ca=cca(gtsdata_data_t)<br />
&gt; gts.ca<br />
Call: cca(X = gtsdata_data_t)</p>
<p>Inertia Rank<br />
Total 1.653<br />
Unconstrained 1.653 3<br />
Inertia is mean squared contingency coefficient</p>
<p>Eigenvalues for unconstrained axes:<br />
CA1 CA2 CA3<br />
0.8721 0.5037 0.2776</p>
<p>&gt; plot(gts.ca,scaling=3)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/4sampleCA-analysis.png"><img class="alignleft size-full wp-image-560" title="4sampleCA-analysis" src="http://www.biofacebook.com/wp-content/uploads/2012/08/4sampleCA-analysis.png" alt="" width="750" height="400" /></a></p>
<p>&gt; gtsdata_data_t_del&lt;-gtsdata_data_t[1:3,]<br />
&gt; gtsdata_data_t_del</p>
<p>&gt; gts.rda=rda(gtsdata_data_t_del,gtsenv)<br />
&gt; gts.rda<br />
Call: rda(X = gtsdata_data_t_del, Y = gtsenv)</p>
<p>Inertia Proportion Rank<br />
Total 101790 1<br />
Constrained 101790 1 2<br />
Unconstrained 0 0 0<br />
Inertia is variance<br />
Some constraints were aliased because they were collinear (redundant)</p>
<p>Eigenvalues for constrained axes:<br />
RDA1 RDA2<br />
81240 20549</p>
<p>plot(gts.rda,display=c(&#8220;sp&#8221;,&#8221;bp&#8221;,&#8221;si&#8221;),scaling=3)</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/3sampleRDA-analysis.png"><img class="alignleft size-full wp-image-561" title="3sampleRDA-analysis" src="http://www.biofacebook.com/wp-content/uploads/2012/08/3sampleRDA-analysis.png" alt="" width="750" height="400" /></a></p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/gtsenv2.txt">gtsenv.txt</a></p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/4sample_R_cluster.csv">gtsdata.txt</a></p>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=559</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>基于Vegan 软件包的生态学数据排序分析学习</title>
		<link>http://www.biofacebook.com/?p=531</link>
		<comments>http://www.biofacebook.com/?p=531#comments</comments>
		<pubDate>Tue, 28 Aug 2012 04:40:20 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[R语言]]></category>
		<category><![CDATA[统计学习]]></category>
		<category><![CDATA[bioinformatics]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=531</guid>
		<description><![CDATA[<p>“基于Vegan 软件包的生态学数据排序分析 赖江山 米湘成 (中国科学院植物研究所植被与环境变化国家重点实验室，北京 100093) 摘要：群落学数据一般是多维数据，例如物种属性或环境因子的属性。多元统计分析是群落生态学常用的分析方法，排序（ordination）是多元统计最常用的方法之一。CANOCO是广泛使用的排序软件，但缺点是商业软件价格不菲，版本更新速度也很慢。近年来，R语言以其灵活、开放、易于掌握、免费等诸多优点，在生态学和生物多样性研究领域迅速赢得广大研究人员的青睐。R语言中的外在软件包“Vegan”是专门用于群落生态学分析的工具。Vegan能够提供所有基本的排序方法，同时具有生成精美排序图的功能，版本更新很快。我们认为Vegan包完全可以取代CANOCO，成为今后排序分析的首选统计工具。本文首先简述排序的原理和类型，然后介绍Vegan的基本信息和下载安装过程，最后以古田山24公顷样地内随机抽取40个20m×20m的样方为例，展示Vegan包内各种常用排序方法（PCA,RDA,CA和CCA）和排序图生成过程，希望能为R的初学者尽快熟悉并利用Vegan包进行排序分析提供参考。</p> <p>gtsdata</p> <p>gtsenv.txt</p> <p>赖江山.pdf</p> &#62; setwd("/winxp_disk2/shenzy/R/Vegan") &#62; gtsdata=read.table("gtsdata.txt", header=T) &#62; gtsenv=read.table("gtsenv.txt", header=T) &#62; install.packages("vegan") Installing package(s) into ‘/home/shenzy/R/x86_64-pc-linux-gnu-library/2.15’ (as ‘lib’ is unspecified) 试开URL’http://cran.csiro.au/src/contrib/vegan_2.0-4.tar.gz' Content type 'application/x-gzip' length 1576584 bytes (1.5 Mb) 打开了URL ================================================== downloaded 1.5 Mb * installing *source* package ‘vegan’ ... ** 成功将‘vegan’程序包解包并MD5和检查 ** libs gfortran -fpic -O3 [...]]]></description>
				<content:encoded><![CDATA[<p>“基于Vegan 软件包的生态学数据排序分析<br />
赖江山 米湘成<br />
(中国科学院植物研究所植被与环境变化国家重点实验室，北京 100093)<br />
摘要：群落学数据一般是多维数据，例如物种属性或环境因子的属性。多元统计分析是群落生态学常用的分析方法，排序（ordination）是多元统计最常用的方法之一。CANOCO是广泛使用的排序软件，但缺点是商业软件价格不菲，版本更新速度也很慢。近年来，R语言以其灵活、开放、易于掌握、免费等诸多优点，在生态学和生物多样性研究领域迅速赢得广大研究人员的青睐。R语言中的外在软件包“Vegan”是专门用于群落生态学分析的工具。Vegan能够提供所有基本的排序方法，同时具有生成精美排序图的功能，版本更新很快。我们认为Vegan包完全可以取代CANOCO，成为今后排序分析的首选统计工具。本文首先简述排序的原理和类型，然后介绍Vegan的基本信息和下载安装过程，最后以古田山24公顷样地内随机抽取40个20m×20m的样方为例，展示Vegan包内各种常用排序方法（PCA,RDA,CA和CCA）和排序图生成过程，希望能为R的初学者尽快熟悉并利用Vegan包进行排序分析提供参考。</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/gtsdata.txt">gtsdata</a></p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/gtsenv1.txt">gtsenv.txt</a></p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/赖江山.pdf">赖江山.pdf</a></p>
<pre>&gt; setwd("/winxp_disk2/shenzy/R/Vegan")
&gt; gtsdata=read.table("gtsdata.txt", header=T)
&gt; gtsenv=read.table("gtsenv.txt", header=T)
&gt; install.packages("vegan")
Installing package(s) into ‘/home/shenzy/R/x86_64-pc-linux-gnu-library/2.15’
(as ‘lib’ is unspecified)
试开URL’http://cran.csiro.au/src/contrib/vegan_2.0-4.tar.gz'
Content type 'application/x-gzip' length 1576584 bytes (1.5 Mb)
打开了URL
==================================================
downloaded 1.5 Mb

* installing *source* package ‘vegan’ ...
** 成功将‘vegan’程序包解包并MD5和检查
** libs
gfortran   -fpic  -O3 -pipe  -g  -c cepin.f -o cepin.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c data2hill.c -o data2hill.o
gfortran   -fpic  -O3 -pipe  -g  -c decorana.f -o decorana.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c goffactor.c -o goffactor.o
gfortran   -fpic  -O3 -pipe  -g  -c monoMDS.f -o monoMDS.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c nestedness.c -o nestedness.o
gfortran   -fpic  -O3 -pipe  -g  -c ordering.f -o ordering.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c pnpoly.c -o pnpoly.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c stepacross.c -o stepacross.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG      -fpic  -O3 -pipe  -g  -c vegdist.c -o vegdist.o
gcc -std=gnu99 -shared -o vegan.so cepin.o data2hill.o decorana.o goffactor.o monoMDS.o nestedness.o ordering.o pnpoly.o stepacross.o vegdist.o -lgfortran -lm -lquadmath -L/usr/lib/R/lib -lR
安装至 /home/shenzy/R/x86_64-pc-linux-gnu-library/2.15/vegan/libs
** R
** data
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** installing vignettes
   ‘decision-vegan.Rnw’
   ‘diversity-vegan.Rnw’ using ‘UTF-8’
   ‘intro-vegan.Rnw’ using ‘UTF-8’
** testing if installed package can be loaded

* DONE (vegan)

The downloaded source packages are in
	‘/tmp/RtmpmtXtEK/downloaded_packages’
&gt; library(vegan)
载入需要的程辑包：permute

载入程辑包：‘permute’

The following object(s) are masked from ‘package:gtools’:

    permute

This is vegan 2.0-4
&gt; decorana(gtsdata)

Call:
decorana(veg = gtsdata) 

Detrended correspondence analysis with 26 segments.
Rescaling of axes with 4 iterations.

                  DCA1   DCA2    DCA3    DCA4
Eigenvalues     0.3939 0.2239 0.09555 0.06226
Decorana values 0.5025 0.1756 0.06712 0.03877
Axis lengths    3.2595 2.5130 1.21445 1.00854

&gt; gts.pca=rda(gtsdata)
&gt; gts.pca
Call: rda(X = gtsdata)

              Inertia Rank
Total           352.1
Unconstrained   352.1   22
Inertia is variance 

Eigenvalues for unconstrained axes:
    PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8
111.779  73.580  54.607  32.959  26.481  18.063  12.763   7.637
(Showed only 8 of all 22 unconstrained eigenvalues)

Note: 通过以上命令选择排序模型（线性模型PCA、RDA或单峰模型CA、CCA），因为Axis lengths 等同于CANOCO中的DCA分析，
DCA排序数值最大max&gt;4选单峰，&lt;3 选线性模型， 3&lt;max&lt;4 则都二者都行，这里3.2595属于此情况！总的特征根为352.1，即物种分布
的总变化量。PCA排序为每个抽所能解释的方差变化量。对于第一抽而言，111.779/352.1=31.7% 为其物种分布的解释量。</pre>
<pre>&gt; plot(gts.pca)
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot0.png"><img class="alignleft size-full wp-image-538" title="Rplot0" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot0.png" alt="" width="750" height="400" /></a></pre>
<p>&nbsp;</p>
<pre> &gt;biplot(gts.pca)
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot1.png"><img class="alignleft size-full wp-image-539" title="Rplot1" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot1.png" alt="" width="750" height="400" /></a></pre>
<p>因以上重叠现象严重，原因是物种分布差异打，分布不均匀的物种占据了大部分排序空间，可对物种数据进行单位方差标准化。通过scale参数实现，如下：</p>
<pre>&gt; gts.pca=rda(gtsdata, scale=T)
&gt; biplot(gts.pca,scaling=3)
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot2.png"><img class="alignleft size-full wp-image-542" title="Rplot2" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot2.png" alt="" width="750" height="400" /></a></pre>
<pre>Note:scaling=1 关注物种间关系</pre>
<pre>scaling=2 关注样方之间关系
scaling=3 关注样方与物种之间关系</pre>
<pre>&gt; biplot(gts.pca,display="sp")</pre>
<pre>&gt; biplot(gts.pca,display="si")</pre>
<pre>&gt; biplot(gts.pca,display="sp", choices=c(1,3))
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot3.png"><img class="alignleft size-full wp-image-544" title="Rplot3" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot3.png" alt="" width="750" height="400" /></a></pre>
<pre></pre>
<p>CA分析：</p>
<pre>&gt; gts.ca=cca(gtsdata)
&gt; gts.ca
Call: cca(X = gtsdata)

              Inertia Rank
Total           1.424
Unconstrained   1.424   21
Inertia is mean squared contingency coefficient 

Eigenvalues for unconstrained axes:
    CA1     CA2     CA3     CA4     CA5     CA6     CA7     CA8
0.50253 0.26564 0.14023 0.10502 0.09127 0.05540 0.05063 0.04204
(Showed only 8 of all 21 unconstrained eigenvalues)</pre>
<pre>&gt; plot(gts.ca,scaling=3)
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot4.png"><img class="alignleft size-full wp-image-546" title="Rplot4" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot4.png" alt="" width="750" height="400" /></a></pre>
<pre>从CA解读即：如某一个物种靠近某个样方，表明该物种可能对样方位置起很大作用。从图可以看出20号样方与短柄饱（QUESER）很近。同时19与20号样方距离近，表明物种组结构特征也近！而只有少数样方出现的物种，如CASCAR，通常在排序空间边缘，表明只偶然发生。该列对应样方数值都很小或0！对在排序中心的物种，可能在取样区域是其最优分布。对应该列（CASERY）数值较大而多！</pre>
<pre>
RDA分析（多个矩阵分析）：</pre>
<pre>&gt; gts.rda=rda(gtsdata,gtsenv)
&gt; gts.rda
Call: rda(X = gtsdata, Y = gtsenv)

               Inertia Proportion Rank
Total         352.0917     1.0000
Constrained   137.4026     0.3902    8
Unconstrained 214.6891     0.6098   22
Inertia is variance 

Eigenvalues for constrained axes:
   RDA1    RDA2    RDA3    RDA4    RDA5    RDA6    RDA7    RDA8
56.3864 42.7769 17.8270 13.5066  2.5020  2.1217  1.6616  0.6203 

Eigenvalues for unconstrained axes:
   PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
72.287 54.891 26.618 17.959 12.730  9.918  5.659  5.349
(Showed only 8 of all 22 unconstrained eigenvalues)</pre>
<pre>plot(gts.rda,display=c("sp","bp","si"),scaling=3)
<a href="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot6.png"><img class="alignleft size-full wp-image-548" title="Rplot6" src="http://www.biofacebook.com/wp-content/uploads/2012/08/Rplot6.png" alt="" width="750" height="400" /></a></pre>
<pre>在RDA排序中，箭头连线长度代表某个</pre>
<pre>环境因子与群落分布和种类分布间相关
程度的大小，越长相关性越大。
箭头连线和排序抽的夹角代表某个环境因子
与排序抽的相关性大小，越小相关性越大！</pre>
<pre>&gt; gts.prda=rda(gtsdata,gtsenv[,1:4], gtsenv[,5:8])
&gt; gts.prda
Call: rda(X = gtsdata, Y = gtsenv[, 1:4], Z = gtsenv[, 5:8])

               Inertia Proportion Rank
Total         352.0917     1.0000
Conditional    95.0318     0.2699    4
Constrained    42.3708     0.1203    4
Unconstrained 214.6891     0.6098   22
Inertia is variance 

Eigenvalues for constrained axes:
  RDA1   RDA2   RDA3   RDA4
27.522  9.087  3.442  2.320 

Eigenvalues for unconstrained axes:
   PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8
72.287 54.891 26.618 17.959 12.730  9.918  5.659  5.349
(Showed only 8 of all 22 unconstrained eigenvalues)
Note: gtsenv[,1:4]表示环境矩阵只取前4列，即地形因子。Constrained为42.37除以
352.09=12.03%，表示地形因子单独所能解释的特征根占总特征根的百分比。Y，Z调换下，
可得土壤因子单独的解释量，2者总共的解释量前面已经算出，即为39.02%。所以2组环境
变量共同的解释量为39.02%-15.53%-12.03%=11.46%!

CCA分析类似
&gt;gts.cca=cca(gtsdata,gtsenv)</pre>
<pre></pre>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=531</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>R OTU heatmap2</title>
		<link>http://www.biofacebook.com/?p=521</link>
		<comments>http://www.biofacebook.com/?p=521#comments</comments>
		<pubDate>Thu, 23 Aug 2012 07:24:35 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[R语言]]></category>
		<category><![CDATA[R]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=521</guid>
		<description><![CDATA[<p>source(&#8220;http://www.bioconductor.org/biocLite.R&#8221;); biocLite(&#8220;affy&#8221;); biocLite(&#8220;Biobase&#8221;); library(affy); library(Biobase);</p> <p>&#62;bac_4sampledata=read.csv(&#8220;/home/R_heatmap/4sample_R_cluster.csv&#8221;, sep=&#8221;\t&#8221;) &#62; row.names(bac_4sampledata)&#60;-bac_4sampledata$Group &#62; bac_4sample_Datamatrix&#60;-data.matrix(bac_4sampledata[,2:5]) &#62; heatmap.2(bac_4sample_Datamatrix, distfun=dist,col=greenred(256), scale=&#8221;row&#8221;, key=TRUE, symkey=FALSE, density.info=&#8221;none&#8221;, trace=&#8221;none&#8221;, cexRow=0.5, cexCol=0.7,margin=c(7,30), keysize=1.5);</p> <p>4sample_R_cluster_stdtop100</p> &#62; heatmap.2(bac_4sample_Datamatrix, distfun = function(x) dist(x,method = 'euclidean'),hclustfun = function(x) hclust(x,method = 'centroid'),col=greenred(256), scale="row", key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5, cexCol=0.7,margin=c(7,30), keysize=1.5); [...]]]></description>
				<content:encoded><![CDATA[<p>source(&#8220;http://www.bioconductor.org/biocLite.R&#8221;);<br />
biocLite(&#8220;affy&#8221;);<br />
biocLite(&#8220;Biobase&#8221;);<br />
library(affy);<br />
library(Biobase);</p>
<p>&gt;bac_4sampledata=read.csv(&#8220;/home/R_heatmap/4sample_R_cluster.csv&#8221;, sep=&#8221;\t&#8221;)<br />
&gt; row.names(bac_4sampledata)&lt;-bac_4sampledata$Group<br />
&gt; bac_4sample_Datamatrix&lt;-data.matrix(bac_4sampledata[,2:5])<br />
&gt; heatmap.2(bac_4sample_Datamatrix, distfun=dist,col=greenred(256), scale=&#8221;row&#8221;, key=TRUE, symkey=FALSE, density.info=&#8221;none&#8221;, trace=&#8221;none&#8221;, cexRow=0.5, cexCol=0.7,margin=c(7,30), keysize=1.5);</p>
<p><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/4sample_R_cluster_stdtop100.csv">4sample_R_cluster_stdtop100</a></p>
<pre><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/4STbac-sample-heatmap.png"><img class="alignleft size-full wp-image-522" title="4STbac-sample-heatmap" src="http://www.biofacebook.com/wp-content/uploads/2012/08/4STbac-sample-heatmap.png" alt="" width="750" height="500" /></a></pre>
<pre> &gt; heatmap.2(bac_4sample_Datamatrix, distfun = function(x) dist(x,method = 'euclidean'),hclustfun = function(x) hclust(x,method = 'centroid'),col=greenred(256), scale="row", key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5, cexCol=0.7,margin=c(7,30), keysize=1.5);</pre>
<pre><a href="http://www.biofacebook.com/wp-content/uploads/2012/08/4dsampleRplot.png"><img class="alignleft size-full wp-image-526" title="4dsampleRplot" src="http://www.biofacebook.com/wp-content/uploads/2012/08/4dsampleRplot.png" alt="" width="750" height="500" /></a></pre>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=521</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>454 pyrosequencing analysis pipeline</title>
		<link>http://www.biofacebook.com/?p=494</link>
		<comments>http://www.biofacebook.com/?p=494#comments</comments>
		<pubDate>Thu, 16 Aug 2012 08:27:16 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[metagenome]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=494</guid>
		<description><![CDATA[<p>mothur &#62; sffinfo(sff=454Reads_archaea.sff, flow=T) Extracting info from 454Reads_archaea.sff &#8230; 10000 20000 30000 40000 50000 60000 70000 80000 90000 92115 It took 68 secs to extract 92115. Output File Names: 454Reads_archaea.fasta 454Reads_archaea.qual 454Reads_archaea.flow</p> <p>mothur &#62; trim.flows(flow=454Reads_archaea.flow, oligos=oligos_LXY.txt, pdiffs=2, bdiffs=1, processors=2) Appending files from process 15674</p> <p>Output File Names: 454Reads_archaea.trim.flow 454Reads_archaea.scrap.flow 454Reads_archaea.GZ_ARC.flow 454Reads_archaea.GZ1122_ARC.flow 454Reads_archaea.GZ1122cellulose_ARC.flow 454Reads_archaea.GZ_xylan_ARC.flow 454Reads_archaea.GZ_cellulose55_ARC.flow 454Reads_archaea.SHX_xylan_ARC.flow [...]]]></description>
				<content:encoded><![CDATA[<p>mothur &gt; sffinfo(sff=454Reads_archaea.sff, flow=T)<br />
Extracting info from 454Reads_archaea.sff &#8230;<br />
10000<br />
20000<br />
30000<br />
40000<br />
50000<br />
60000<br />
70000<br />
80000<br />
90000<br />
92115<br />
It took 68 secs to extract 92115.<br />
Output File Names:<br />
454Reads_archaea.fasta<br />
454Reads_archaea.qual<br />
454Reads_archaea.flow</p>
<p>mothur &gt; trim.flows(flow=454Reads_archaea.flow, oligos=oligos_LXY.txt, pdiffs=2, bdiffs=1, processors=2)<br />
Appending files from process 15674</p>
<p>Output File Names:<br />
454Reads_archaea.trim.flow<br />
454Reads_archaea.scrap.flow<br />
454Reads_archaea.GZ_ARC.flow<br />
454Reads_archaea.GZ1122_ARC.flow<br />
454Reads_archaea.GZ1122cellulose_ARC.flow<br />
454Reads_archaea.GZ_xylan_ARC.flow<br />
454Reads_archaea.GZ_cellulose55_ARC.flow<br />
454Reads_archaea.SHX_xylan_ARC.flow<br />
454Reads_archaea.GZ_xylose_ARC.flow<br />
454Reads_archaea.Eric_ARC.flow<br />
454Reads_archaea.Milk_D_ARC.flow<br />
454Reads_archaea.Milk_E_ARC.flow<br />
454Reads_archaea.ST1219_ARC.flow<br />
454Reads_archaea.YL_ARC.flow<br />
454Reads_archaea.SHX_xylose_ARC.flow<br />
454Reads_archaea.SHX_cellulose55_ARC.flow<br />
454Reads_archaea.TP_1201_ARC.flow<br />
454Reads_archaea.ST_ARC.flow<br />
454Reads_archaea.YL0203cellulose_ARC.flow<br />
454Reads_archaea.TP_xylan_ARC.flow<br />
454Reads_archaea.ST0303cellulose_ARC.flow<br />
454Reads_archaea.SHX_ARC.flow<br />
454Reads_archaea.ST_xylan_ARC.flow<br />
454Reads_archaea.YL_xylan_ARC.flow<br />
454Reads_archaea.SHX1219_ARC.flow<br />
454Reads_archaea.SHX1125cellulose_ARC.flow<br />
454Reads_archaea.flow.files</p>
<p>mothur &gt; shhh.flows(file=454Reads_archaea.flow.files, processors=4)</p>
<p>mothur &gt; trim.seqs(fasta=454Reads_archaea.shhh.fasta, name=454Reads_archaea.shhh.names, oligos=oligos_LXY.txt, pdiffs=2, bdiffs=1, maxhomop=8, minlength=150, flip=T, processors=2)</p>
<p>Total of all groups is 44091</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.fasta<br />
454Reads_archaea.shhh.scrap.fasta<br />
454Reads_archaea.shhh.trim.names<br />
454Reads_archaea.shhh.scrap.names<br />
454Reads_archaea.shhh.groups</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.fasta, name=454Reads_archaea.shhh.trim.names)<br />
Using 2 processors.<br />
Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 218 218 0 3 1<br />
2.5%-tile: 1 251 251 0 3 1103<br />
25%-tile: 1 268 268 0 4 11023<br />
Median: 1 274 274 0 4 22046<br />
75%-tile: 1 281 281 0 4 33069<br />
97.5%-tile: 1 297 297 0 5 42989<br />
Maximum: 1 333 333 0 8 44091<br />
Mean: 1 273.837 273.837 0 4.15944<br />
# of unique seqs: 12780<br />
total # of seqs: 44091</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.summary</p>
<p>mothur &gt; unique.seqs(fasta=454Reads_archaea.shhh.trim.fasta, name=454Reads_archaea.shhh.trim.names)</p>
<p>1000 959<br />
2000 1691<br />
3000 2431<br />
4000 3358<br />
5000 4352<br />
6000 5335<br />
7000 6328<br />
8000 7261<br />
9000 8187<br />
10000 9082<br />
11000 9963<br />
12000 10859<br />
12780 11449</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta<br />
454Reads_archaea.shhh.trim.unique.names</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta, name=454Reads_archaea.shhh.trim.unique.names)<br />
Using 2 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 218 218 0 3 1<br />
2.5%-tile: 1 251 251 0 3 1103<br />
25%-tile: 1 268 268 0 4 11023<br />
Median: 1 274 274 0 4 22046<br />
75%-tile: 1 281 281 0 4 33069<br />
97.5%-tile: 1 297 297 0 5 42989<br />
Maximum: 1 333 333 0 8 44091<br />
Mean: 1 273.837 273.837 0 4.15944<br />
# of unique seqs: 11449<br />
total # of seqs: 44091</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.summary</p>
<p>Submit to RDP database, check and filter bacteria sequences!</p>
<p>http://rdp.cme.msu.edu/classifier/cl_status.jsp</p>
<p>domain Bacteria (1435 sequences)</p>
<p>shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc allrank_454Reads_archaea.shhh.trim.unique.fasta_classified.txt<br />
1435 1520 200784 allrank_454Reads_archaea.shhh.trim.unique.fasta_classified.txt</p>
<p>./filter_bacterseqs_for_align.py -i allrank_454Reads_archaea.shhh.trim.unique.fasta_classified.txt -f 454Reads_archaea.shhh.trim.unique.fasta -n 454Reads_archaea.shhh.trim.unique.names -g 454Reads_archaea.shhh.groups</p>
<p>shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc 454Reads_archaea.shhh.groups.filter<br />
42187 84374 1224728 454Reads_archaea.shhh.groups.filter<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc 454Reads_archaea.shhh.groups<br />
44091 88182 1270431 454Reads_archaea.shhh.groups</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.filter, name=454Reads_archaea.shhh.trim.unique.names.filter)<br />
Using 2 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 218 218 0 3 1<br />
2.5%-tile: 1 256 256 0 3 1055<br />
25%-tile: 1 268 268 0 4 10547<br />
Median: 1 274 274 0 4 21094<br />
75%-tile: 1 282 282 0 4 31641<br />
97.5%-tile: 1 297 297 0 5 41133<br />
Maximum: 1 333 333 0 8 42187<br />
Mean: 1 274.543 274.543 0 4.15355<br />
# of unique seqs: 10014<br />
total # of seqs: 42187<br />
mothur &gt; screen.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.align, name=454Reads_archaea.shhh.trim.unique.names.filter, group=454Reads_archaea.shhh.groups.filter, processors=2)<br />
Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.summary<br />
###mothur &gt; align.seqs(candidate=454Reads_archaea.shhh.trim.unique.fasta.filter, template=core_set_aligned.imputed.fasta, flip=T, ksize=9, align=needleman, gapopen=-1, processors=3)<br />
###</p>
<p>mothur &gt; align.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.filter, reference=core_set_aligned.fasta.imputed, flip=T, processors=3)<br />
Using 3 processors.</p>
<p>Reading in the core_set_aligned.fasta.imputed template sequences&#8230; DONE.<br />
It took 1 to read 4938 sequences.<br />
Aligning sequences from 454Reads_archaea.shhh.trim.unique.fasta.filter &#8230;<br />
100<br />
&#8230;<br />
3338<br />
Some of you sequences generated alignments that eliminated too many bases, a list is provided in 454Reads_archaea.shhh.trim.unique.fasta.flip.accnos. If the reverse compliment proved to be better it was reported.<br />
It took 60 secs to align 10014 sequences.<br />
Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.align<br />
454Reads_archaea.shhh.trim.unique.fasta.align.report<br />
454Reads_archaea.shhh.trim.unique.fasta.flip.accnos</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.align, name=454Reads_archaea.shhh.trim.unique.names.filter)<br />
Using 3 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 86 98 2 0 1 1<br />
2.5%-tile: 132 1746 51 0 3 1055<br />
25%-tile: 136 1822 268 0 4 10547<br />
Median: 136 1834 274 0 4 21094<br />
75%-tile: 136 1850 282 0 4 31641<br />
97.5%-tile: 194 1887 297 0 5 41133<br />
Maximum: 6858 6885 313 0 8 42187<br />
Mean: 284.168 1920.46 266.145 0 4.10781<br />
# of unique seqs: 10014<br />
total # of seqs: 42187</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.summary<br />
##mothur &gt; screen.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.align, name=454Reads_archaea.shhh.trim.unique.names.filter, group=454Reads_archaea.shhh.groups.filter, ##start=136, optimize=end, criteria=90, processors=2)<br />
#The optimize and criteria parameters allow you set the start, end, maxabig, maxhomop, minlength and maxlength parameters relative to your set of sequences .<br />
#For example optimize=start-end, criteria=90, would set the start and end values to the position 90% of your sequences started and ended.</p>
<p>mothur &gt; screen.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.align, name=454Reads_archaea.shhh.trim.unique.names.filter, group=454Reads_archaea.shhh.groups.filter, optimize=start-end, criteria=90, processors=4)<br />
&#8230;<br />
Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.align<br />
454Reads_archaea.shhh.trim.unique.fasta.bad.accnos<br />
454Reads_archaea.shhh.trim.unique.names.good.filter<br />
454Reads_archaea.shhh.groups.good.filter<br />
It took 4 secs to screen 10014 sequences.</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.align, name=454Reads_archaea.shhh.trim.unique.names.good.filter)</p>
<p>Using 4 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 107 1819 243 0 3 1<br />
2.5%-tile: 133 1821 263 0 3 925<br />
25%-tile: 136 1831 269 0 4 9242<br />
Median: 136 1836 274 0 4 18484<br />
75%-tile: 136 1853 283 0 4 27725<br />
97.5%-tile: 136 1871 298 0 5 36042<br />
Maximum: 136 1920 313 0 8 36966<br />
Mean: 135.731 1840.24 276.401 0 4.14224<br />
# of unique seqs: 7703<br />
total # of seqs: 36966</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.summary</p>
<p>&nbsp;</p>
<p>mothur &gt; filter.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.align, vertical=T, trump=., processors=2)<br />
3700<br />
3800<br />
3851</p>
<p>Length of filtered alignment: 486<br />
Number of columns removed: 7196<br />
Length of the original alignment: 7682<br />
Number of sequences used to construct filter: 7703</p>
<p>Output File Names:<br />
454Reads_archaea.filter<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.fasta<br />
mothur &gt; unique.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.fasta, name=454Reads_archaea.shhh.trim.unique.names.good.filter)</p>
<p>1000 974<br />
2000 1887<br />
3000 2768<br />
4000 3604<br />
5000 4424<br />
6000 5238<br />
7000 6017<br />
7703 6573</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.fasta<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.names</p>
<p>mothur &gt; shhh.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.names, group=454Reads_archaea.shhh.groups.good.filter, processors=3)<br />
Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.unique.fasta<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.unique.names</p>
<p>/******************************************/</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh.Eric_ARC.map<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh.GZ1122_ARC.map<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh.GZ1122cellulose_ARC.map<br />
&#8230;&#8230;.</p>
<p>mothur &gt; summary.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.names)<br />
Using 2 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 484 219 0 3 1<br />
2.5%-tile: 1 486 260 0 3 925<br />
25%-tile: 1 486 261 0 4 9242<br />
Median: 1 486 261 0 4 18484<br />
75%-tile: 1 486 266 0 4 27725<br />
97.5%-tile: 1 486 266 0 5 36042<br />
Maximum: 3 486 282 0 7 36966<br />
Mean: 1.00103 486 262.434 0 4.1387<br />
# of unique seqs: 2911<br />
total # of seqs: 36966</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.summary</p>
<p>&nbsp;</p>
<p>mothur &gt; chimera.uchime(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.names, group=454Reads_archaea.shhh.groups.good.filter, processors=3)<br />
It took 0 secs to check 46 sequences from group YL_xylan_ARC.</p>
<p>It took 43 secs to check 3276 sequences. 362 chimeras were found.<br />
The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples.</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.chimeras<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos<br />
##############################3<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ mv 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.self<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ mv 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.chimeras 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.chimeras.self<br />
################################</p>
<p>mothur &gt;chimera.uchime(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.fasta, reference=core_set_aligned.fasta.imputed, processors=3)<br />
05:04 26Mb 100.0% 30/969 chimeras found (3.1%)<br />
05:11 26Mb 100.0% 88/970 chimeras found (9.1%)</p>
<p>It took 311 secs to check 2911 sequences. 213 chimeras were found.</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.chimeras<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos</p>
<p>###################################<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos<br />
213 213 3195 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.self<br />
362 362 5430 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.self<br />
cat 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.self &gt; 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum</p>
<p>sort 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum &gt; 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort</p>
<p>merge 2 predict results of chimera and del repeat!<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ sed &#8216;$!N; /^\(.*\)\n\1$/!P; D&#8217; 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort &gt; 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq<br />
382 382 5730 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq<br />
####################################</p>
<p>get_fasta_from_seqname.py -i 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq -j 454Reads_archaea.fasta &gt; 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq.fasta</p>
<p>chimera seqs RDP checking (http://rdp.cme.msu.edu/classifier/classifier.jsp)<br />
Check the last genus id percent, if percent &gt;=90%, (keep it and merge it to the non-chimera reads of each sample)<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ more allrank_454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq.fasta_classified.txt<br />
HQ93PQ301A0ZHW;;Root;100%;Archaea;100%;&#8221;Euryarchaeota&#8221;;100%;&#8221;Methanomicrobia&#8221;;100%;Methanomicrobiales;100%;Methanospirillaceae;100%;Methanospirillum;100%<br />
HQ93PQ301A1JYN;;Root;100%;Archaea;100%;&#8221;Euryarchaeota&#8221;;100%;&#8221;Methanomicrobia&#8221;;100%;Methanosarcinales;100%;Methanosarcinaceae;100%;Methanosarcina;100%<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ check_real_chimera_seq.py -i allrank_454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.uchime.accnos.sum.sort.uniq.fasta_classified.txt -d 90 | wc<br />
221 221 3315</p>
<p>The 221 sequences should be merged to non-chimera results!!</p>
<p>-rwxrwxrwx 1 root root 2.4K 2012-08-14 11:19 chimera.seqs.name<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ wc chimera.seqs.name<br />
161 161 2415 chimera.seqs.name</p>
<p>######################################################################<br />
Removing chimeras (the total predict chimera seqs by two approaches!)<br />
######################################################################<br />
mothur &gt; remove.seqs(accnos=chimera.seqs.name, fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.names, group=454Reads_archaea.shhh.groups.good.filter)</p>
<p>Removed 1197 sequences from your name file.<br />
Removed 161 sequences from your fasta file.<br />
Removed 1197 sequences from your group file.</p>
<p>Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.names<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.fasta<br />
454Reads_archaea.shhh.groups.good.pick.filter<br />
mothur &gt; summary.seqs(name=current)</p>
<p>Using 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.names as input file for the name parameter.<br />
Using 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.fasta as input file for the fasta parameter.</p>
<p>Using 3 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 484 219 0 3 1<br />
2.5%-tile: 1 486 260 0 3 895<br />
25%-tile: 1 486 261 0 4 8943<br />
Median: 1 486 261 0 4 17885<br />
75%-tile: 1 486 266 0 4 26827<br />
97.5%-tile: 1 486 266 0 5 34875<br />
Maximum: 3 486 278 0 7 35769<br />
Mean: 1.00106 486 262.504 0 4.16914<br />
# of unique seqs: 2750<br />
total # of seqs: 35769</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.summary<br />
#############################<br />
chimera number<br />
##############################<br />
./compute_chimera_for_singlesample.py -i 454Reads_Bacteria.Eric_BAC.shhh.groups -j chimera.seqs.name</p>
<p>GZ1122_ARC: 10<br />
GZ1122cellulose_ARC: 3<br />
GZ_ARC: 1<br />
GZ_cellulose55_ARC: 1<br />
GZ_xylan_ARC: 7<br />
GZ_xylose_ARC: 4<br />
SHX1125cellulose_ARC: 0<br />
SHX1219_ARC: 0<br />
SHX_ARC: 5<br />
SHX_cellulose55_ARC: 8<br />
SHX_xylan_ARC: 3<br />
SHX_xylose_ARC: 35<br />
ST0303cellulose_ARC: 15<br />
ST1219_ARC: 5<br />
ST_ARC: 21<br />
ST_xylan_ARC: 11<br />
TP_1201_ARC: 0<br />
TP_xylan_ARC: 19<br />
YL0203cellulose_ARC: 7<br />
YL_ARC: 2<br />
YL_xylan_ARC: 3</p>
<p>&nbsp;</p>
<p>#######################<br />
Removing &#8220;contaminants&#8221;<br />
#######################<br />
wget http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip<br />
shenzy@shenzy-ubuntu:/winxp_disk2/shenzy/xiaoying_work_archaea/16s_archaea_allsamples/shhh_pipe2$ unzip Trainset9_032012.pds.zip<br />
Archive: Trainset9_032012.pds.zip<br />
inflating: trainset9_032012.pds.tax<br />
inflating: trainset9_032012.pds.fasta</p>
<p>mothur &gt; classify.seqs(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.names, group=454Reads_archaea.shhh.groups.good.pick.filter, template=trainset9_032012.pds.fasta, taxonomy=trainset9_032012.pds.tax, cutoff=80, processors=2)<br />
&#8230;.<br />
Processing sequence: 1300<br />
Processing sequence: 1300<br />
[WARNING]: HQ93PQ301C4CTV could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences.<br />
[WARNING]: HQ93PQ301CK6YP could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences.<br />
[WARNING]: HQ93PQ301DJMTI could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences.<br />
[WARNING]: HQ93PQ301ERRC6 could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences.<br />
Processing sequence: 1372<br />
Processing sequence: 1371<br />
It took 25 secs to classify 2750 sequences.</p>
<p>Reading 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.names&#8230; Done.</p>
<p>It took 3 secs to create the summary file for 2750 sequences.<br />
Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.taxonomy<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.flip.accnos<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.tax.summary</p>
<p>&nbsp;</p>
<p>mothur &gt; remove.lineage(fasta=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.fasta, name=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.names, group=454Reads_archaea.shhh.groups.good.pick.filter, taxonomy=454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.taxonomy, taxon=Mitochondria-Cyanobacteria_Chloroplast-Eukarya-Bacteria-unknown)<br />
Output File Names:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.pick.taxonomy<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.names<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.fasta<br />
454Reads_archaea.shhh.groups.good.pick.pick.filter<br />
mothur &gt; summary.seqs(name=current)</p>
<p>Using 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.names as input file for the name parameter.<br />
Using 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.fasta as input file for the fasta parameter.</p>
<p>Using 2 processors.</p>
<p>Start End NBases Ambigs Polymer NumSeqs<br />
Minimum: 1 484 219 0 3 1<br />
2.5%-tile: 1 486 260 0 3 852<br />
25%-tile: 1 486 261 0 4 8519<br />
Median: 1 486 261 0 4 17037<br />
75%-tile: 1 486 266 0 4 25555<br />
97.5%-tile: 1 486 266 0 5 33221<br />
Maximum: 1 486 278 0 7 34072<br />
Mean: 1 486 262.598 0 4.21437<br />
# of unique seqs: 2644<br />
total # of seqs: 34072</p>
<p>Output File Name:<br />
454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.summary</p>
<p>############################################################################################<br />
mothur &gt; system(cp 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pds.pick.taxonomy archaea_16s_final.taxonomy)<br />
mothur &gt; system(cp 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.names archaea_16s_final.names)<br />
mothur &gt; system(cp 454Reads_archaea.shhh.trim.unique.fasta.good.filter.unique.shhh_seqs.pick.pick.fasta archaea_16s_final.fasta)<br />
mothur &gt; system(cp 454Reads_archaea.shhh.groups.good.pick.pick.filter archaea_16s_final.groups)<br />
mothur &gt; dist.seqs(fasta=archaea_16s_final.fasta, cutoff=0.1, processors=3)</p>
<p>Output File Name:<br />
archaea_16s_final.dist</p>
<p>It took 25 to calculate the distances for 2644 sequences.</p>
<p>mothur &gt; cluster(column=archaea_16s_final.dist, name=archaea_16s_final.names)<br />
changed cutoff to 0.0392497</p>
<p>Output File Names:<br />
archaea_16s_final.an.sabund<br />
archaea_16s_final.an.rabund<br />
archaea_16s_final.an.list</p>
<p>It took 9 seconds to cluster</p>
<p>mothur &gt; make.shared(list=archaea_16s_final.an.list, group=archaea_16s_final.groups)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03</p>
<p>Output File Names:<br />
archaea_16s_final.an.shared<br />
archaea_16s_final.an.Eric_ARC.rabund<br />
archaea_16s_final.an.GZ1122_ARC.rabund<br />
archaea_16s_final.an.GZ1122cellulose_ARC.rabund<br />
archaea_16s_final.an.GZ_ARC.rabund<br />
archaea_16s_final.an.GZ_cellulose55_ARC.rabund<br />
archaea_16s_final.an.GZ_xylan_ARC.rabund<br />
archaea_16s_final.an.GZ_xylose_ARC.rabund<br />
archaea_16s_final.an.Milk_D_ARC.rabund<br />
archaea_16s_final.an.Milk_E_ARC.rabund<br />
archaea_16s_final.an.SHX1125cellulose_ARC.rabund<br />
archaea_16s_final.an.SHX1219_ARC.rabund<br />
archaea_16s_final.an.SHX_ARC.rabund<br />
archaea_16s_final.an.SHX_cellulose55_ARC.rabund<br />
archaea_16s_final.an.SHX_xylan_ARC.rabund<br />
archaea_16s_final.an.SHX_xylose_ARC.rabund<br />
archaea_16s_final.an.ST0303cellulose_ARC.rabund<br />
archaea_16s_final.an.ST1219_ARC.rabund<br />
archaea_16s_final.an.ST_ARC.rabund<br />
archaea_16s_final.an.ST_xylan_ARC.rabund<br />
archaea_16s_final.an.TP_1201_ARC.rabund<br />
archaea_16s_final.an.TP_xylan_ARC.rabund<br />
archaea_16s_final.an.YL0203cellulose_ARC.rabund<br />
archaea_16s_final.an.YL_ARC.rabund<br />
archaea_16s_final.an.YL_xylan_ARC.rabund<br />
mothur &gt; count.groups()</p>
<p>Using archaea_16s_final.an.shared as input file for the shared parameter.<br />
Eric_ARC contains 14.<br />
GZ1122_ARC contains 1780.<br />
GZ1122cellulose_ARC contains 1063.<br />
GZ_ARC contains 53.<br />
GZ_cellulose55_ARC contains 1997.<br />
GZ_xylan_ARC contains 1509.<br />
GZ_xylose_ARC contains 1241.<br />
Milk_D_ARC contains 19.<br />
Milk_E_ARC contains 434.<br />
SHX1125cellulose_ARC contains 2568.<br />
SHX1219_ARC contains 2012.<br />
SHX_ARC contains 1594.<br />
SHX_cellulose55_ARC contains 2235.<br />
SHX_xylan_ARC contains 944.<br />
SHX_xylose_ARC contains 1932.<br />
ST0303cellulose_ARC contains 1815.<br />
ST1219_ARC contains 1597.<br />
ST_ARC contains 774.<br />
ST_xylan_ARC contains 1755.<br />
TP_1201_ARC contains 1952.<br />
TP_xylan_ARC contains 1849.<br />
YL0203cellulose_ARC contains 1762.<br />
YL_ARC contains 1154.<br />
YL_xylan_ARC contains 2019.<br />
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^old^^^^^^^^^^^^^<br />
Using archaea_16s_final.an.shared as input file for the shared parameter.<br />
Eric_ARC contains 15.***********<br />
GZ1122_ARC contains 1815.<br />
GZ1122cellulose_ARC contains 1081.<br />
GZ_ARC contains 54.<br />
GZ_cellulose55_ARC contains 1997.<br />
GZ_xylan_ARC contains 1547.<br />
GZ_xylose_ARC contains 1245.<br />
Milk_D_ARC contains 18. *********<br />
Milk_E_ARC contains 434. ********<br />
SHX1125cellulose_ARC contains 2570.<br />
SHX1219_ARC contains 2012.<br />
SHX_ARC contains 1593.<br />
SHX_cellulose55_ARC contains 2236.<br />
SHX_xylan_ARC contains 947.<br />
SHX_xylose_ARC contains 1932.<br />
ST0303cellulose_ARC contains 1810.<br />
ST1219_ARC contains 1597.<br />
ST_ARC contains 759.<br />
ST_xylan_ARC contains 1755.<br />
TP_1201_ARC contains 1952.<br />
TP_xylan_ARC contains 1849.<br />
YL0203cellulose_ARC contains 1762.<br />
YL_ARC contains 1164.<br />
YL_xylan_ARC contains 2019.</p>
<p>&nbsp;</p>
<p>mothur &gt; count.groups()</p>
<p>Using archaea_16s_final.an.shared as input file for the shared parameter.<br />
Eric_ARC contains 569.<br />
GZ1122_ARC contains 2103.<br />
GZ1122cellulose_ARC contains 1594.<br />
GZ_ARC contains 530.<br />
GZ_cellulose55_ARC contains 2001.<br />
GZ_xylan_ARC contains 1889.<br />
GZ_xylose_ARC contains 2015.<br />
Milk_D_ARC contains 598.<br />
Milk_E_ARC contains 1753.<br />
SHX1125cellulose_ARC contains 2831.<br />
SHX1219_ARC contains 2247.<br />
SHX_ARC contains 1660.<br />
SHX_cellulose55_ARC contains 2249.<br />
SHX_xylan_ARC contains 1213.<br />
SHX_xylose_ARC contains 1991.<br />
ST0303cellulose_ARC contains 1845.<br />
ST1219_ARC contains 1621.<br />
ST_ARC contains 1859.<br />
ST_xylan_ARC contains 1769.<br />
TP_1201_ARC contains 1969.<br />
TP_xylan_ARC contains 1890.<br />
YL0203cellulose_ARC contains 1785.<br />
YL_ARC contains 1285.<br />
YL_xylan_ARC contains 2025.</p>
<p>&nbsp;<br />
mothur &gt; sub.sample(shared=archaea_16s_final.an.shared, size=759)</p>
<p>Eric_ARC contains 15. Eliminating.<br />
GZ_ARC contains 54. Eliminating.<br />
Milk_D_ARC contains 18. Eliminating.<br />
Milk_E_ARC contains 434. Eliminating.<br />
Sampling 759 from each group.<br />
unique<br />
0.01<br />
0.02<br />
0.03</p>
<p>Output File Names:<br />
archaea_16s_final.an.uniquesubsample.shared<br />
archaea_16s_final.an.0.01subsample.shared<br />
archaea_16s_final.an.0.02subsample.shared<br />
archaea_16s_final.an.0.03subsample.shared<br />
mothur &gt; classify.otu(list=archaea_16s_final.an.list, name=archaea_16s_final.names, taxonomy=archaea_16s_final.taxonomy)</p>
<p>reftaxonomy is not required, but if given will keep the rankIDs in the summary file static.<br />
unique 2636<br />
0.01 1940<br />
0.02 1033<br />
0.03 634</p>
<p>Output File Names:<br />
archaea_16s_final.an.uniquecons.taxonomy<br />
archaea_16s_final.an.uniquecons.tax.summary<br />
archaea_16s_final.an.0.01cons.taxonomy<br />
archaea_16s_final.an.0.01cons.tax.summary<br />
archaea_16s_final.an.0.02cons.taxonomy<br />
archaea_16s_final.an.0.02cons.tax.summary<br />
archaea_16s_final.an.0.03cons.taxonomy<br />
archaea_16s_final.an.0.03cons.tax.summary</p>
<p>&nbsp;</p>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=494</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
		<item>
		<title>My PROJECT</title>
		<link>http://www.biofacebook.com/?p=212</link>
		<comments>http://www.biofacebook.com/?p=212#comments</comments>
		<pubDate>Wed, 30 May 2012 03:23:48 +0000</pubDate>
		<dc:creator><![CDATA[szypanther]]></dc:creator>
				<category><![CDATA[My project]]></category>
		<category><![CDATA[work]]></category>

		<guid isPermaLink="false">http://www.biofacebook.com/?p=212</guid>
		<description><![CDATA[<p>Amoa project</p> <p>A2,A3 amoa sample sequencing</p> <p>(1) use geneious to deal with the sequence raw data, get 635bp DNA sequence</p> <p>(2)use mafft to get the alignment results</p> <p>(3)mothur deal pipline</p> <p>mothur &#62; dist.seqs(fasta=AMOA_A2A3.mafft.align.shortname.fasta, output=lt)</p> <p>0 0 59 0</p> <p>Output File Name: AMOA_A2A3.mafft.align.shortname.phylip.dist</p> <p>It took 0 to calculate the distances for 60 sequences.</p> <p>mothur &#62; cluster(phylip=AMOA_A2A3.mafft.align.shortname.phylip.dist, [...]]]></description>
				<content:encoded><![CDATA[<p><strong>Amoa project</strong></p>
<p>A2,A3  amoa sample sequencing</p>
<p>(1) use geneious to deal with the sequence raw data,  get 635bp DNA sequence</p>
<p>(2)use mafft to get the alignment results</p>
<p>(3)mothur deal pipline</p>
<p>mothur &gt; dist.seqs(fasta=AMOA_A2A3.mafft.align.shortname.fasta, output=lt)</p>
<p>0 0<br />
59 0</p>
<p>Output File Name:<br />
AMOA_A2A3.mafft.align.shortname.phylip.dist</p>
<p>It took 0 to calculate the distances for 60 sequences.</p>
<p>mothur &gt; cluster(phylip=AMOA_A2A3.mafft.align.shortname.phylip.dist, cutoff=0.05)</p>
<p>********************#****#****#****#****#****#****#****#****#****#****#<br />
Reading matrix: ||||||||||||||||||||||||||||||||||||||||||||||||||||<br />
***********************************************************************<br />
unique 4 49 2 1 1<br />
0.01 11 22 4 3 0 2 0 0 0 0 0 1<br />
0.02 16 15 4 2 0 0 0 0 0 0 0 0 0 0 0 1 1<br />
0.03 36 10 3 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 00 0 0 0 0 0 0 0 0 0 0 0 0 0 1<br />
0.04 38 9 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 00 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1<br />
changed cutoff to 0.0352006</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.sabund<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.rabund<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.list</p>
<p>It took 0 seconds to cluster</p>
<p>mothur &gt; unique.seqs(fasta=AMOA_A2A3.mafft.align.shortname.fasta)</p>
<p>60 53</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.unique.fasta<br />
AMOA_A2A3.mafft.align.shortname.names</p>
<p>mothur &gt; parse.list(list=AMOA_A2A3.mafft.align.shortname.phylip.an.list, group=AMOA_A2A3.mafft.align.shortname.group)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03<br />
0.04</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.list<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.list</p>
<p>mothur &gt; rarefaction.single(list=AMOA_A2A3.mafft.align.shortname.phylip.an.A3.list, iters=10000, freq=0.10, calc=ace-sobs-chao, processors=3)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03<br />
0.04</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.r_ace<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.rarefaction<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.r_chao<br />
mothur &gt; rarefaction.single(list=AMOA_A2A3.mafft.align.shortname.phylip.an.A2.list, iters=10000, freq=0.10, calc=ace-sobs-chao, processors=3)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03<br />
0.04</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.r_ace<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.rarefaction<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.r_chao</p>
<p>mothur &gt; rarefaction.single(list=AMOA_A2A3.mafft.align.shortname.phylip.an.A3.list, iters=10000, freq=0.10, calc=shannon, processors=3)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03<br />
0.04</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.r_shannon<br />
mothur &gt; rarefaction.single(list=AMOA_A2A3.mafft.align.shortname.phylip.an.A2.list, iters=10000, freq=0.10, calc=shannon, processors=3)</p>
<p>unique<br />
0.01<br />
0.02<br />
0.03<br />
0.04</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.r_shannon</p>
<p>&nbsp;</p>
<p>mothur &gt; get.oturep(phylip=AMOA_A2A3.mafft.align.shortname.phylip.dist, fasta=AMOA_A2A3.mafft.align.shortname.fasta, list=AMOA_A2A3.mafft.align.shortname.phylip.an.A2.list, group=AMOA_A2A3.mafft.align.shortname.group)</p>
<p>********************#****#****#****#****#****#****#****#****#****#****#<br />
Reading matrix: ||||||||||||||||||||||||||||||||||||||||||||||||||||<br />
***********************************************************************<br />
unique 27<br />
0.01 17<br />
0.02 12<br />
0.03 7<br />
0.04 6</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.unique.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.01.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.02.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.03.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.04.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.01.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.02.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.03.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.0.04.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A2.unique.rep.fasta<br />
mothur &gt; get.oturep(phylip=AMOA_A2A3.mafft.align.shortname.phylip.dist, fasta=AMOA_A2A3.mafft.align.shortname.fasta, list=AMOA_A2A3.mafft.align.shortname.phylip.an.A3.list, group=AMOA_A2A3.mafft.align.shortname.group)</p>
<p>********************#****#****#****#****#****#****#****#****#****#****#<br />
Reading matrix: ||||||||||||||||||||||||||||||||||||||||||||||||||||<br />
***********************************************************************<br />
unique 29<br />
0.01 22<br />
0.02 16<br />
0.03 13<br />
0.04 11</p>
<p>Output File Names:<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.unique.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.01.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.02.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.03.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.04.rep.names<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.01.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.02.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.03.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.0.04.rep.fasta<br />
AMOA_A2A3.mafft.align.shortname.phylip.an.A3.unique.rep.fasta</p>
<p>R studio 画图</p>
<pre>&gt; setwd("/home/shenzy/Desktop/amoA_work")
&gt; data0&lt;-read.table(file="AMOA_A2A3.mafft.align.shortname.phylip.an.A2.rarefaction",header=T)
&gt; data1&lt;-read.table(file="AMOA_A2A3.mafft.align.shortname.phylip.an.A3.rarefaction",header=T)
&gt; plot(x=data0$numsampled, y=data0$X0.03, xlab="Number of clones analyzed",ylab="Number of OTUs observed", type="l", col="green", xlim=c(0,30),ylim=c(0,15))
&gt; points(x=data1$numsampled, y=data1$X0.03, type="l", col="blue")
&gt; legend(x=2, y=14, c("A2","A3"),c("green","blue")</pre>
<pre><a href="http://www.biofacebook.com/wp-content/uploads/2012/05/Rplot.png"><img class="alignleft size-full wp-image-217" title="Rplot" src="http://www.biofacebook.com/wp-content/uploads/2012/05/Rplot.png" alt="" width="750" height="400" /></a></pre>
<p>&nbsp;</p>
<p>&nbsp;</p>
]]></content:encoded>
			<wfw:commentRss>http://www.biofacebook.com/?feed=rss2&#038;p=212</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>
