Windows 10 batch scripts #54

Open · wants to merge 3 commits into base: glue-0.9
34 changes: 28 additions & 6 deletions README.md
@@ -17,17 +17,39 @@ Install the Spark distribution from the following location based on the Glue version:
Glue version 0.9: https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-0.9/spark-2.2.1-bin-hadoop2.7.tgz
Glue version 1.0: https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-1.0/spark-2.4.3-bin-hadoop2.8.tgz
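
The version-to-archive mapping above can be sketched as a small helper (the function name is hypothetical, not part of this repository; the URLs are the ones listed above):

```shell
#!/bin/sh
# Hypothetical helper: map a Glue version to its Spark archive URL.
spark_url() {
  case "$1" in
    0.9) echo "https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-0.9/spark-2.2.1-bin-hadoop2.7.tgz" ;;
    1.0) echo "https://aws-glue-etl-artifacts.s3.amazonaws.com/glue-1.0/spark-2.4.3-bin-hadoop2.8.tgz" ;;
    *)   echo "unsupported Glue version: $1" >&2; return 1 ;;
  esac
}

# Example: download and extract into the current directory:
# curl -fSL "$(spark_url 0.9)" | tar -xz
```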

Export the `SPARK_HOME` environment variable, pointing to the directory where the Spark archive above was extracted.

Glue version 0.9:
```bash
export SPARK_HOME=/home/$USER/spark-2.2.1-bin-hadoop2.7
```

Glue version 1.0:
```bash
export SPARK_HOME=/home/$USER/spark-2.4.3-bin-spark-2.4.3-bin-hadoop2.8
```
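
A quick sanity check (a sketch, not part of this repository) that `SPARK_HOME` points at a usable Spark installation:

```shell
#!/bin/sh
# Hypothetical check: a valid SPARK_HOME should contain bin/spark-submit.
check_spark_home() {
  [ -n "$1" ] && [ -x "$1/bin/spark-submit" ]
}

if check_spark_home "$SPARK_HOME"; then
  echo "SPARK_HOME looks valid: $SPARK_HOME"
else
  echo "SPARK_HOME is unset or has no bin/spark-submit" >&2
fi
```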


The `gluepytest` script assumes that the `pytest` module is installed and available on the `PATH`.
Glue shell: `./bin/gluepyspark`

Glue submit: `./bin/gluesparksubmit`

pytest: `./bin/gluepytest`
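
Since `gluepytest` relies on `pytest` being resolvable from the `PATH`, it can help to check for it up front. A minimal sketch (the helper name is hypothetical):

```shell
#!/bin/sh
# Hypothetical helper: report whether a command is resolvable on PATH.
have_cmd() {
  command -v "$1" >/dev/null 2>&1
}

if have_cmd pytest; then
  echo "pytest found at $(command -v pytest)"
else
  echo "pytest not on PATH; install it first (e.g. pip install pytest)" >&2
fi
```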


#### Windows platform instructions
You must set the `HADOOP_HOME` environment variable to point to a Windows binaries distribution of Hadoop. You can download Windows binaries for Hadoop from the following URL:
https://github.com/steveloughran/winutils

Since the above Spark distribution is built against Hadoop 2.8, set the `HADOOP_HOME` variable to the `hadoop-2.8.3` directory of the downloaded repository.


```bash
export HADOOP_HOME=/home/$USER/winutils/hadoop-2.8.3
```
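
A sanity check for that layout, as a sketch in POSIX path form (on Windows the equivalent check targets `%HADOOP_HOME%\bin\winutils.exe`; the helper name is hypothetical):

```shell
#!/bin/sh
# Hypothetical check: a usable HADOOP_HOME for this setup should contain
# bin/winutils.exe from the winutils repository.
check_hadoop_home() {
  [ -n "$1" ] && [ -f "$1/bin/winutils.exe" ]
}

if check_hadoop_home "$HADOOP_HOME"; then
  echo "HADOOP_HOME looks valid: $HADOOP_HOME"
else
  echo "HADOOP_HOME is unset or has no bin/winutils.exe" >&2
fi
```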
## Licensing

The libraries in this repository are licensed under the [Amazon Software License](http://aws.amazon.com/asl/) (the "License"). They may not be used except in compliance with the License, a copy of which is included here in the LICENSE file.
39 changes: 39 additions & 0 deletions bin/gluepyspark.cmd
@@ -0,0 +1,39 @@
REM This script assumes the JAVA_HOME and HADOOP_HOME variables are set according to the instructions in the README
@echo off
SET ORIGINAL_PYTHON_PATH=%PYTHONPATH%

FOR /F "tokens=* USEBACKQ" %%F IN (`cd`) DO (
SET ROOT_DIR=%%F
)


SET SPARK_CONF_DIR=%ROOT_DIR%\conf
SET GLUE_JARS_DIR=%ROOT_DIR%/jarsv1

SET PYTHONPATH=%SPARK_HOME%\python\;%PYTHONPATH%
for %%x in (%SPARK_HOME%\python\lib\py4j-*-src.zip) do (
SET PYTHONPATH=%PYTHONPATH%;%%x
)

REM Generate the zip archive for the Glue Python modules
REM If PowerShell is not available, replace the following step with a statically built zip file
del PyGlue.zip
powershell Compress-Archive awsglue PyGlue.zip
REM Assuming the PyGlue.zip file is now available

SET GLUE_PY_FILES=%ROOT_DIR%\PyGlue.zip
SET PYTHONPATH=%GLUE_PY_FILES%;%PYTHONPATH%

REM Run the mvn copy-dependencies target to fetch the Glue dependencies locally
call mvn -f %ROOT_DIR%/pom.xml -DoutputDirectory=%ROOT_DIR%/jarsv1 dependency:copy-dependencies
SET SPARK_CONF_DIR=%ROOT_DIR%\conf
mkdir %SPARK_CONF_DIR%
del %SPARK_CONF_DIR%\spark-defaults.conf
REM Generate spark-defaults.conf
SET GLUE_JARS_DIR=%GLUE_JARS_DIR:\=/%
echo spark.driver.extraClassPath %GLUE_JARS_DIR%/* >> %SPARK_CONF_DIR%\spark-defaults.conf
echo spark.executor.extraClassPath %GLUE_JARS_DIR%/* >> %SPARK_CONF_DIR%\spark-defaults.conf

REM Call pyspark, forwarding the arguments
%SPARK_HOME%\bin\pyspark %*
SET PYTHONPATH=%ORIGINAL_PYTHON_PATH%
41 changes: 41 additions & 0 deletions bin/gluesparksubmit.cmd
@@ -0,0 +1,41 @@
REM This script assumes the JAVA_HOME and HADOOP_HOME variables are set according to the instructions in the README
@echo off
SET ORIGINAL_PYTHON_PATH=%PYTHONPATH%

FOR /F "tokens=* USEBACKQ" %%F IN (`cd`) DO (
SET ROOT_DIR=%%F
)


SET SPARK_CONF_DIR=%ROOT_DIR%\conf
SET GLUE_JARS_DIR=%ROOT_DIR%/jarsv1

SET PYTHONPATH=%SPARK_HOME%\python\;%PYTHONPATH%
for %%x in (%SPARK_HOME%\python\lib\py4j-*-src.zip) do (
SET PYTHONPATH=%PYTHONPATH%;%%x
)

REM Generate the zip archive for the Glue Python modules
REM If PowerShell is not available, replace the following step with a statically built zip file
del PyGlue.zip
powershell Compress-Archive awsglue PyGlue.zip
REM Assuming the PyGlue.zip file is now available

SET GLUE_PY_FILES=%ROOT_DIR%\PyGlue.zip
SET PYTHONPATH=%GLUE_PY_FILES%;%PYTHONPATH%

REM Run the mvn copy-dependencies target to fetch the Glue dependencies locally
call mvn -f %ROOT_DIR%/pom.xml -DoutputDirectory=%ROOT_DIR%/jarsv1 dependency:copy-dependencies
SET SPARK_CONF_DIR=%ROOT_DIR%\conf
mkdir %SPARK_CONF_DIR%
del %SPARK_CONF_DIR%\spark-defaults.conf
REM Generate spark-defaults.conf
SET GLUE_JARS_DIR=%GLUE_JARS_DIR:\=/%
echo spark.driver.extraClassPath %GLUE_JARS_DIR%/* >> %SPARK_CONF_DIR%\spark-defaults.conf
echo spark.executor.extraClassPath %GLUE_JARS_DIR%/* >> %SPARK_CONF_DIR%\spark-defaults.conf

REM Call spark-submit, forwarding the arguments

%SPARK_HOME%\bin\spark-submit --py-files %GLUE_PY_FILES% %*
SET PYTHONPATH=%ORIGINAL_PYTHON_PATH%