@echo off
@rem
@rem Licensed to the Apache Software Foundation (ASF) under one or more
@rem contributor license agreements. See the NOTICE file distributed with
@rem this work for additional information regarding copyright ownership.
@rem The ASF licenses this file to You under the Apache License, Version 2.0
@rem (the "License"); you may not use this file except in compliance with
@rem the License. You may obtain a copy of the License at
@rem
@rem http://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@rem
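@rem Trains and tests a bayes classifier on the 20newsgroups dataset.
@rem The dataset is not downloaded automatically: it must already be extracted
@rem under work\20news-bydate next to this script (instructions are printed
@rem if it is missing).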
@rem
@rem To run: change into the mahout directory and invoke this script
@rem (the Windows counterpart of examples/bin/build-20news.sh).
@rem ---------------------------------------------------------------------------
@rem Setup: derive all local paths from the location of this script (%~dp0 ends
@rem with a backslash, and so do the directory variables built from it).
@rem The set "NAME=value" form keeps special characters and stray trailing
@rem blanks out of the values.
@rem ---------------------------------------------------------------------------
setlocal enabledelayedexpansion
set "SCRIPT_PATH=%~dp0"
set "MAHOUT_BIN_PATH=%SCRIPT_PATH%..\..\bin\"
set "WORK_PATH=%SCRIPT_PATH%work\20news-bydate\"
@rem Working directory on the HDFS side (forward slashes, no drive letter).
set "HDFS_WORK_PATH=examples/bin/work/20news-bydate/"

@rem The job steps further down are launched through %HADOOP_HOME%\bin\hadoop,
@rem so stop early with a clear message when the variable is missing.
if not defined HADOOP_HOME (
    echo HADOOP_HOME is not set. Point it at the Hadoop installation directory. 1>&2
    exit /b 1
)

@rem Paths are quoted because the install location may contain spaces.
if not exist "%WORK_PATH%" mkdir "%WORK_PATH%"

@rem Both extracted dataset halves must be present before anything is run.
if not exist "%WORK_PATH%20news-bydate-train" goto :ErrorMessage
if not exist "%WORK_PATH%20news-bydate-test" goto :ErrorMessage
@rem ---------------------------------------------------------------------------
@rem Prepare step: run PrepareTwentyNewsgroups once for the train split and once
@rem for the test split, writing bayes-train-input / bayes-test-input under
@rem WORK_PATH.
@rem ---------------------------------------------------------------------------
@rem Work from the Mahout bin directory; the matching popd is at the end of the
@rem script. Abort if the directory cannot be entered.
pushd "%MAHOUT_BIN_PATH%" || (
    echo Cannot change to the Mahout bin directory. 1>&2
    exit /b 1
)
@rem Disabled alternative that launched the same two runs through mahout.cmd;
@rem kept for reference.
REM call mahout.cmd org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups ^
REM -p %WORK_PATH%\20news-bydate-train ^
REM -o %WORK_PATH%\bayes-train-input ^
REM -a org.apache.mahout.vectorizer.DefaultAnalyzer ^
REM -c UTF-8
REM call mahout.cmd org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups ^
REM -p %WORK_PATH%\20news-bydate-test ^
REM -o %WORK_PATH%\bayes-test-input ^
REM -a org.apache.mahout.vectorizer.DefaultAnalyzer ^
REM -c UTF-8

@rem Train split. Local paths are quoted so directories with spaces survive.
call "%HADOOP_HOME%\bin\hadoop" jar "%MAHOUT_BIN_PATH%..\mahout-examples-0.5-job.jar" ^
    org.apache.mahout.driver.MahoutDriver ^
    org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups ^
    -p "%WORK_PATH%20news-bydate-train" ^
    -o "%WORK_PATH%bayes-train-input" ^
    -a org.apache.mahout.vectorizer.DefaultAnalyzer ^
    -c UTF-8
@rem Without prepared input the later steps cannot succeed, so stop here.
if errorlevel 1 (
    echo Preparing the training input failed. 1>&2
    popd
    exit /b 1
)

@rem Test split, same options as above.
call "%HADOOP_HOME%\bin\hadoop" jar "%MAHOUT_BIN_PATH%..\mahout-examples-0.5-job.jar" ^
    org.apache.mahout.driver.MahoutDriver ^
    org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups ^
    -p "%WORK_PATH%20news-bydate-test" ^
    -o "%WORK_PATH%bayes-test-input" ^
    -a org.apache.mahout.vectorizer.DefaultAnalyzer ^
    -c UTF-8
if errorlevel 1 (
    echo Preparing the test input failed. 1>&2
    popd
    exit /b 1
)
@rem mapreduce test method used on hadoop
@rem (the double quotes are part of the value and are passed along with it)
set TEST_METHOD="mapreduce"

@rem ---------------------------------------------------------------------------
@rem Staging step: replace the prepared input on HDFS with the fresh local copy.
@rem ---------------------------------------------------------------------------
@rem Remove earlier copies first. The result of -rmr is deliberately not
@rem checked, e.g. on a first run there is nothing to remove yet.
call hadoop.cmd dfs -rmr %HDFS_WORK_PATH%bayes-train-input
call hadoop.cmd dfs -rmr %HDFS_WORK_PATH%bayes-test-input

@rem WORK_PATH already ends with a backslash, so the directory name is appended
@rem directly (no doubled separator). Local paths are quoted in case they
@rem contain spaces.
call hadoop.cmd dfs -put ^
    "%WORK_PATH%bayes-train-input" ^
    %HDFS_WORK_PATH%bayes-train-input
if errorlevel 1 (
    echo Uploading the training input to HDFS failed. 1>&2
    popd
    exit /b 1
)
call hadoop.cmd dfs -put ^
    "%WORK_PATH%bayes-test-input" ^
    %HDFS_WORK_PATH%bayes-test-input
if errorlevel 1 (
    echo Uploading the test input to HDFS failed. 1>&2
    popd
    exit /b 1
)
@rem ---------------------------------------------------------------------------
@rem Train / test step: run the trainclassifier and testclassifier programs of
@rem MahoutDriver against the input staged under HDFS_WORK_PATH.
@rem ---------------------------------------------------------------------------
@rem Disabled alternative that launched the same two runs through mahout.cmd;
@rem kept for reference.
REM call mahout.cmd trainclassifier ^
REM -i %HDFS_WORK_PATH%bayes-train-input ^
REM -o %HDFS_WORK_PATH%bayes-model ^
REM -type bayes ^
REM -ng 1 ^
REM -source hdfs
REM call mahout.cmd testclassifier ^
REM -m %HDFS_WORK_PATH%bayes-model ^
REM -d %HDFS_WORK_PATH%bayes-test-input ^
REM -type bayes ^
REM -ng 1 ^
REM -source hdfs ^
REM -method %TEST_METHOD%

@rem Train: input is bayes-train-input, output is bayes-model. The executable
@rem and jar paths are quoted so locations with spaces work.
call "%HADOOP_HOME%\bin\hadoop" jar "%MAHOUT_BIN_PATH%..\mahout-examples-0.5-job.jar" ^
    org.apache.mahout.driver.MahoutDriver ^
    trainclassifier ^
    -i %HDFS_WORK_PATH%bayes-train-input ^
    -o %HDFS_WORK_PATH%bayes-model ^
    -type bayes ^
    -ng 1 ^
    -source hdfs
@rem Testing depends on the model written above, so skip it when training failed.
if errorlevel 1 (
    echo Training the classifier failed. 1>&2
    popd
    exit /b 1
)

@rem Test: evaluate bayes-model against bayes-test-input using TEST_METHOD.
call "%HADOOP_HOME%\bin\hadoop" jar "%MAHOUT_BIN_PATH%..\mahout-examples-0.5-job.jar" ^
    org.apache.mahout.driver.MahoutDriver ^
    testclassifier ^
    -m %HDFS_WORK_PATH%bayes-model ^
    -d %HDFS_WORK_PATH%bayes-test-input ^
    -type bayes ^
    -ng 1 ^
    -source hdfs ^
    -method %TEST_METHOD%
if errorlevel 1 (
    echo Testing the classifier failed. 1>&2
    popd
    exit /b 1
)

@rem Undo the earlier pushd and finish successfully. Exiting here also keeps
@rem execution from running into the :ErrorMessage section below.
popd
exit /b 0
:ErrorMessage
@rem Reached when the extracted train or test directory is missing under
@rem WORK_PATH: tell the user where to get the dataset and where to unpack it.
echo Please download 20news-bydate.tar.gz from:
echo http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz
echo and extract it under:
echo %WORK_PATH%
@rem Report the failure to the caller instead of ending with a success code.
exit /b 1