Connection issue with AWS S3 from PySpark 2.3.1

classic Classic list List threaded Threaded
13 messages Options
Reply | Threaded
Open this post in threaded view
|

Connection issue with AWS S3 from PySpark 2.3.1

Aakash Basu-2
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 

Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Shuporno Choudhury


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Aakash Basu-2
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Shuporno Choudhury
Hi,
I don't know whether the following configs (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Aakash Basu-2
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following configs (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Aakash Basu-2
Any help, anyone?

On Fri, Dec 21, 2018 at 2:21 PM Aakash Basu <[hidden email]> wrote:
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following configs (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Riccardo Ferrari
Hi Aakash,

Can you share how are you adding those jars? Are you using the package method ? I assume you're running in a cluster, and those dependencies might have not properly distributed.

How are you submitting your app? What kind of resource manager are you using standalone, yarn, ...

Best,

On Fri, Dec 21, 2018 at 1:18 PM Aakash Basu <[hidden email]> wrote:
Any help, anyone?

On Fri, Dec 21, 2018 at 2:21 PM Aakash Basu <[hidden email]> wrote:
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following config (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Aakash Basu-2
Hi Ricardo,

I am running in my local machine, that too, in a pyspark shell.

Starting shell with: pyspark --jars "hadoop-aws-2.7.3.jar,aws-java-sdk-1.11.472.jar"

Then submitting the following code:
sc = spark.sparkContext
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
hadoop_conf.set("fs.s3a.endpoint", "us-east-1.amazonaws.com")
a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

I followed Shuporno's suggestion and added what he asked for:
"The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key"

But, there was an error which said I should not be using these two configs, but should use what I was using before, hence I changed back to previous, which you can see in my code above.

I have put two of these jars in the jars and bin directory of the spark package which I downloaded and untarred and the bin of which is set in the bashrc so that it can be called and reused.

RM isn't YARN.

Please let me know, if you need any more info.

Thanks,
Aakash.

On Fri, Dec 21, 2018 at 6:03 PM Riccardo Ferrari <[hidden email]> wrote:
Hi Aakash,

Can you share how are you adding those jars? Are you using the package method ? I assume you're running in a cluster, and those dependencies might have not properly distributed.

How are you submitting your app? What kind of resource manager are you using standalone, yarn, ...

Best,

On Fri, Dec 21, 2018 at 1:18 PM Aakash Basu <[hidden email]> wrote:
Any help, anyone?

On Fri, Dec 21, 2018 at 2:21 PM Aakash Basu <[hidden email]> wrote:
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following config (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the CLI.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Spark 2 - How to order keys in sparse vector (K-means)?

ddebarbieux
Dear all,

I am using Spark 2 in order to cluster data with the K-means algorithm.
My input data is flat and K-means requires sparse vectors with ordered
keys. Here is an example of an input and the expected output:

[id, key, value]
[1, 10, 100]
[1, 30, 300]
[2, 40, 400]
[1, 20, 200]

[id, list(key), list(value)]
[1, [10, 20, 30], [100, 200, 300]]
[2, [40], [400]]

The naive algorithm — order by key, then group by id, then aggregate
into a list — is not working: the list is unordered!
Depending on how data is partitioned, Spark will append values to the
list as soon as it finds a row in the group. The order depends on how
Spark plans the aggregation over the executors.

Thanks to the web, I have more complex algorithm:

WindowSpec w = Window.partitionBy("id").orderBy("key");
ds.withColumn("collect_list(key)", collect_list("key").over(w))
    .withColumn("collect_list(value))", collect_list("value").over(w))
    .groupBy("id")
    .agg(
       max("collect_list(key)").alias("collect_list(key)"),
       max("collect_list(value)").alias("collect_list(value)"))

According to my test, it returns the expected result, but it opens 2
questions:

  1. Does an easier way to do it exist?
  2. Why is it working? Can I assume that all the data with the same id
is in the same partition? Since I use over(w) two times, why are the pairs
(key, value) never mixed?

Thanks for your help.
Denis



---------------------------------------------------------------------
To unsubscribe e-mail: [hidden email]

Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

Riccardo Ferrari
In reply to this post by Aakash Basu-2
ok, can you switch to the '--package' option instead of '--jars'.

In particular, the 'jars' parameter only ships the specific jar references; what you actually need is all the transitive dependencies as managed by the 'package' option.
Inspecting the content of the aws-java-sdk.jar, there is only a pom.xml and not a single class.

You need something like: pyspark --packages com.amazonaws:aws-java-sdk:1.11.472,org.apache.hadoop:hadoop-aws:2.7.3 ...

HTH,

On Fri, Dec 21, 2018 at 2:46 PM Aakash Basu <[hidden email]> wrote:
Hi Ricardo,

I am running in my local machine, that too, in a pyspark shell.

Starting shell with: pyspark --jars "hadoop-aws-2.7.3.jar,aws-java-sdk-1.11.472.jar"

Then submitting the following code:
sc = spark.sparkContext
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
hadoop_conf.set("fs.s3a.endpoint", "us-east-1.amazonaws.com")
a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

I followed Shuporno's suggestion and added what he asked for:
"The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key"

But, there was an error which said I should not be using these two configs, but should use what I was using before, hence I changed back to previous, which you can see in my code above.

I have put two of these jars in the jars and bin directory of the spark package which I downloaded and untarred and the bin of which is set in the bashrc so that it can be called and reused.

RM isn't YARN.

Please let me know, if you need any more info.

Thanks,
Aakash.

On Fri, Dec 21, 2018 at 6:03 PM Riccardo Ferrari <[hidden email]> wrote:
Hi Aakash,

Can you share how are you adding those jars? Are you using the package method ? I assume you're running in a cluster, and those dependencies might have not properly distributed.

How are you submitting your app? What kind of resource manager are you using standalone, yarn, ...

Best,

On Fri, Dec 21, 2018 at 1:18 PM Aakash Basu <[hidden email]> wrote:
Any help, anyone?

On Fri, Dec 21, 2018 at 2:21 PM Aakash Basu <[hidden email]> wrote:
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following config (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the cli.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


Any help?

Thanks,
Aakash. 

 




If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury


--
--Thanks,
Shuporno Choudhury



If you reply to this email, your message will be added to the discussion below:
http://apache-spark-user-list.1001560.n3.nabble.com/Connection-issue-with-AWS-S3-from-PySpark-2-3-1-tp34215p34217.html
To start a new topic under Apache Spark User List, email [hidden email]
To unsubscribe from Apache Spark User List, click here.
NAML


--
--Thanks,
Shuporno Choudhury
Reply | Threaded
Open this post in threaded view
|

Re: Connection issue with AWS S3 from PySpark 2.3.1

☼ R Nair (रविशंकर नायर)
Please refer:


Minio is exactly like Amazon S3; the only difference is that it is on-premise cloud storage. Initially I had problems like yours, but I have detailed the solution. You don't need to go through the pain again.

Best,
Ravi

On Fri, Dec 21, 2018 at 10:33 AM Riccardo Ferrari <[hidden email]> wrote:
ok, can you switch to the '--packages' option instead of '--jars'?

In particular, the '--jars' parameter only ships the specific jars referenced; what you actually need is all the transitive dependencies as managed by the '--packages' option.
Inspecting the content of the aws-java-sdk.jar, there is only a pom.xml and not a single class.

You need something like: pyspark --packages com.amazonaws:aws-java-sdk:1.11.472,org.apache.hadoop:hadoop-aws:2.7.3 ...

HTH,

On Fri, Dec 21, 2018 at 2:46 PM Aakash Basu <[hidden email]> wrote:
Hi Ricardo,

I am running in my local machine, that too, in a pyspark shell.

Starting shell with: pyspark --jars "hadoop-aws-2.7.3.jar,aws-java-sdk-1.11.472.jar"

Then submitting the following code:
sc = spark.sparkContext
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
hadoop_conf.set("fs.s3a.endpoint", "us-east-1.amazonaws.com")
a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

I followed Shuporno's suggestion and added what he asked for:
"The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key"

But there was an error saying I should not be using those two configs and should instead use the ones I had before, so I changed back to the previous configs, as you can see in my code above.

I have put two of these jars in the jars and bin directory of the spark package which I downloaded and untarred and the bin of which is set in the bashrc so that it can be called and reused.

RM isn't YARN.

Please let me know, if you need any more info.

Thanks,
Aakash.

On Fri, Dec 21, 2018 at 6:03 PM Riccardo Ferrari <[hidden email]> wrote:
Hi Aakash,

Can you share how you are adding those jars? Are you using the package method? I assume you're running in a cluster, and those dependencies might not have been properly distributed.

How are you submitting your app? What kind of resource manager are you using: standalone, yarn, ...?

Best,

On Fri, Dec 21, 2018 at 1:18 PM Aakash Basu <[hidden email]> wrote:
Any help, anyone?

On Fri, Dec 21, 2018 at 2:21 PM Aakash Basu <[hidden email]> wrote:
Hey Shuporno,

With the updated config too, I am getting the same error. While trying to figure that out, I found this link which says I need aws-java-sdk (which I already have): https://github.com/amazon-archives/kinesis-storm-spout/issues/8

Now, this is my java details:

java version "1.8.0_181"

Java(TM) SE Runtime Environment (build 1.8.0_181-b13)

Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)



Is it due to some java version mismatch then or is it something else I am missing out? What do you think?


Thanks,
Aakash.

On Fri, Dec 21, 2018 at 1:43 PM Shuporno Choudhury <[hidden email]> wrote:
Hi,
I don't know whether the following configs (that you have tried) are correct:
fs.s3a.awsAccessKeyId
fs.s3a.awsSecretAccessKey

The correct ones probably are:
fs.s3a.access.key
fs.s3a.secret.key

On Fri, 21 Dec 2018 at 13:21, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hey Shuporno,

Thanks for a prompt reply. Thanks for noticing the silly mistake, I tried this out, but still getting another error, which is related to connectivity it seems.

>>> hadoop_conf.set("fs.s3a.awsAccessKeyId", "abcd")
>>> hadoop_conf.set("fs.s3a.awsSecretAccessKey", "123abc")
>>> a = spark.read.csv("s3a:///test-bucket/breast-cancer-wisconsin.csv", header=True)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv
    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o220.csv.
: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)
at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)
at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 28 more


Thanks,
Aakash. 

On Fri, Dec 21, 2018 at 12:51 PM Shuporno Choudhury <[hidden email]> wrote:


On Fri, 21 Dec 2018 at 12:47, Shuporno Choudhury <[hidden email]> wrote:
Hi,
Your connection config uses 's3n' but your read command uses 's3a'.
The config for s3a are:
spark.hadoop.fs.s3a.access.key
spark.hadoop.fs.s3a.secret.key

I feel this should solve the problem.

On Fri, 21 Dec 2018 at 12:09, Aakash Basu-2 [via Apache Spark User List] <[hidden email]> wrote:
Hi,

I am trying to connect to AWS S3 and read a csv file (running POC) from a bucket.

I have s3cmd installed and am able to run ls and other operations from the cli.

Present Configuration:
Python 3.7
Spark 2.3.1

JARs added:
hadoop-aws-2.7.3.jar (in sync with the hadoop version used with spark)
aws-java-sdk-1.11.472.jar

Trying out the following code:

>>> sc=spark.sparkContext

>>> hadoop_conf=sc._jsc.hadoopConfiguration()

>>> hadoop_conf.set("fs.s3n.awsAccessKeyId", "abcd")

>>> hadoop_conf.set("fs.s3n.awsSecretAccessKey", "xyz123")

>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o33.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)

at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

at py4j.commands.CallCommand.execute(CallCommand.java:79)

at py4j.GatewayConnection.run(GatewayConnection.java:238)

at java.lang.Thread.run(Thread.java:748)

Caused by: java.lang.ClassNotFoundException: com.amazonaws.auth.AWSCredentialsProvider

at java.net.URLClassLoader.findClass(URLClassLoader.java:381)

at java.lang.ClassLoader.loadClass(ClassLoader.java:424)

at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)

at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

... 28 more


>>> a = spark.read.csv("s3a://test-bucket/breast-cancer-wisconsin.csv", header=True)

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 441, in csv

    return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco

    return f(*a, **kw)

  File "/Users/aakash/Downloads/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value

py4j.protocol.Py4JJavaError: An error occurred while calling o67.csv.

: java.lang.NoClassDefFoundError: com/amazonaws/auth/AWSCredentialsProvider

at java.lang.Class.forName0(Native Method)

at java.lang.Class.forName(Class.java:348)

at org.apache.hadoop.conf.Configuration.getClassByNameOrNull(Configuration.java:2134)

at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2099)

at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193)

at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654)

at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)

at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)

at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)

at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)

at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)

at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)

at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:45)

at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:354)

at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)

at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)

at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:596)

at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)

at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

at java.lang.reflect.Method.invoke(Method.java:498)

at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

at py4j.Gateway.invoke(Gateway.java:282)