PySpark Plaso
Release 2019
A tool for distributed extraction of timestamps from various files using extractors adapted from the Plaso engine to Apache Spark.


Public Member Functions

def __init__ (self, spark_context)
def open_filesystem (self, hdfs_uri, user="hadoop")
def close_filesystem (self, filesystem=None)
def make_path (self, path_string, qualified=True, filesystem=None)
def append_to_path (self, original_path, new_child_string)
def list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None)
def open_inputstream (self, path, filesystem=None)
def close_stream (self, input_stream)
def get_stream_offset (self, input_stream)
def get_path_size (self, path, filesystem=None)
def read_inputstream (self, input_stream, size=sys.maxsize)
def read_inputstream_with_chunk (self, input_stream, size=sys.maxsize, chunk_size=2048)
def seek_stream (self, stream, offset, whence=os.SEEK_SET)
def seek_stream_with_path (self, stream, offset, whence=os.SEEK_SET, path=None)
def get_filesystem (self, force_filesystem)
def exists (self, path_string, filesystem)
def info (self, path_string, filesystem=None)
def remove (self, path_string, recursive=True, filesystem=None)
def mkdir (self, path_string, filesystem=None)
def open_outputstream (self, path, filesystem=None)
def write_outputstream (self, output_stream, data)
Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs

def make_uri (self, filesystem=None)
def make_simple_path (self, path_string, filesystem=None)
def make_qualified_path (self, path_string, filesystem=None)
def basename (self, path_string)
def dirname (self, path_string)
Public Attributes

spark_context
uri_class
path_class
fs_class
isr_class
fs
Additional Inherited Members

Static Public Member Functions inherited from plaso.tarzan.lib.hdfs.Hdfs

def parse_uri (hdfs_uri)

Static Public Attributes inherited from plaso.tarzan.lib.hdfs.Hdfs

string PATH_SEPARATOR = '/'
string SCHEME = 'hdfs'
HDFS driver utilizing the JVM gateway of the Spark Context.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.__init__ (self, spark_context)

Initialize the driver.
:param spark_context: the Spark Context
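
A minimal construction sketch, assuming a running PySpark application; the HDFS URI is a placeholder, and the later examples on this page reuse the hdfs and fs names introduced here:

    from pyspark import SparkContext
    from plaso.tarzan.lib.pyspark_hdfs import PySparkHdfs

    # Reuse the application's SparkContext; its JVM gateway backs the driver.
    sc = SparkContext.getOrCreate()
    hdfs = PySparkHdfs(sc)

    # Open a filesystem for the calls below ("hdfs://namenode:8020" is a placeholder URI).
    fs = hdfs.open_filesystem("hdfs://namenode:8020")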
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.append_to_path (self, original_path, new_child_string)

Append a path/directory/file to another HDFS path.
:param original_path: the original path
:param new_child_string: the path to append
:return: the resulting path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
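
A hedged sketch of building a path and appending a child to it; hdfs and fs come from the construction sketch above, and the path strings are placeholders:

    base = hdfs.make_path("/data/cases", filesystem=fs)    # placeholder directory
    evidence = hdfs.append_to_path(base, "evidence.zip")   # placeholder child name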
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_filesystem (self, filesystem=None)

Close a given HDFS filesystem.
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.close_stream (self, input_stream)

Close a given (previously opened) input-stream for an HDFS file.
:param input_stream: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.exists (self, path_string, filesystem)

Check if a given HDFS path exists in the given HDFS filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: True if the path exists in the filesystem, False otherwise

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
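
A short sketch of an existence check, reusing the hdfs and fs objects from the construction sketch above; the path string is a placeholder:

    if hdfs.exists("/data/cases/evidence.zip", fs):
        print("the path is present in the filesystem")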
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_filesystem (self, force_filesystem)

Get a given or a default HDFS filesystem.
:param force_filesystem: the given filesystem
:return: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_path_size (self, path, filesystem=None)

Get the size of a given HDFS path (a file) in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the size

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.get_stream_offset (self, input_stream)

Get the current position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param input_stream: the opened input-stream
:return: the position/offset

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.info (self, path_string, filesystem=None)

Get metadata of a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem
:return: the metadata dictionary

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
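
A small sketch of querying path metadata, again reusing hdfs and fs from the construction sketch above; the path string is a placeholder:

    metadata = hdfs.info("/data/cases/evidence.zip", filesystem=fs)
    print(metadata)   # the returned metadata dictionary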
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.list_files (self, path_string, recursion_level=0, include_dir_names=False, filesystem=None)

Get a list of files (optionally recursively up to the specified level) in a given HDFS path of a given filesystem.
:param path_string: the path
:param recursion_level: the level for the recursion (0 is just the current directory without any recursion)
:param include_dir_names: True to also include names of directories (suffixed by /)
:param filesystem: the filesystem
:return: the list of files in the path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
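
A sketch of a recursive listing, assuming the hdfs and fs objects from the construction sketch above; the path string and recursion depth are illustrative only:

    # Descend two directory levels and also report directory names (suffixed by /).
    entries = hdfs.list_files("/data/cases", recursion_level=2,
                              include_dir_names=True, filesystem=fs)
    for entry in entries:
        print(entry)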
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.make_path (self, path_string, qualified=True, filesystem=None)

Get a (qualified) HDFS URI from a given path and a given or a default filesystem.
:param path_string: the path
:param qualified: True to get the qualified path
:param filesystem: the filesystem
:return: the resulting path

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.mkdir (self, path_string, filesystem=None)

Make a new directory (if it does not exist) of a given path in a given or a default filesystem.
:param path_string: the path
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
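
A one-line sketch of creating an output directory, reusing hdfs and fs from the construction sketch above; the directory name is a placeholder:

    hdfs.mkdir("/data/cases/output", filesystem=fs)   # does nothing if the directory already exists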
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_filesystem (self, hdfs_uri, user="hadoop")

Open an HDFS filesystem of a given URI as a given HDFS user.
:param hdfs_uri: the HDFS URI to open
:param user: the HDFS user to act as when opening
:return: the opened filesystem as an object of the org.apache.hadoop.fs.FileSystem class

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
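
A sketch of explicitly opening and closing a filesystem under a non-default HDFS user; both the URI and the user name are placeholders:

    from pyspark import SparkContext
    from plaso.tarzan.lib.pyspark_hdfs import PySparkHdfs

    hdfs = PySparkHdfs(SparkContext.getOrCreate())
    fs = hdfs.open_filesystem("hdfs://namenode:8020", user="forensics")
    try:
        pass  # work with fs, e.g. hdfs.list_files("/", filesystem=fs)
    finally:
        hdfs.close_filesystem(fs)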
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_inputstream (self, path, filesystem=None)

Open and get an input-stream for an HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened input-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
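
A sketch of opening and closing an input-stream, reusing hdfs and fs from the construction sketch above; the path string is a placeholder, and passing the object returned by make_path as the path argument is an assumption:

    path = hdfs.make_path("/data/cases/evidence.zip", filesystem=fs)
    stream = hdfs.open_inputstream(path, filesystem=fs)
    try:
        print(hdfs.get_stream_offset(stream))   # expected to be 0 for a freshly opened stream
    finally:
        hdfs.close_stream(stream)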
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.open_outputstream (self, path, filesystem=None)

Open and get an output-stream for an HDFS file given by its path in a given filesystem.
:param path: the path
:param filesystem: the filesystem
:return: the opened output-stream

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream (self, input_stream, size=sys.maxsize)

Read data from a given opened input-stream for an HDFS file.
:param input_stream: the input-stream
:param size: the size of data to read
:return: the data

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
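
A short sketch of sequential reads from an open stream; stream refers to an input-stream opened as in the open_inputstream sketch above:

    header = hdfs.read_inputstream(stream, size=16)   # only the first 16 bytes
    rest = hdfs.read_inputstream(stream)              # the default size reads the remainder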
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.read_inputstream_with_chunk (self, input_stream, size=sys.maxsize, chunk_size=2048)

Read data from a given opened input-stream for an HDFS file.
:param input_stream: the input-stream
:param size: the size of data to read
:param chunk_size: the size of a chunk
:return: the data
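
A sketch of a chunked read, with stream again taken from the open_inputstream sketch above; the chunk size is illustrative:

    # Read the whole stream while transferring at most 64 KiB per chunk.
    data = hdfs.read_inputstream_with_chunk(stream, chunk_size=64 * 1024)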
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.remove (self, path_string, recursive=True, filesystem=None)

Remove (optionally recursively) an HDFS file/directory given by its path in a given or a default filesystem.
:param path_string: the path
:param recursive: True to remove recursively
:param filesystem: the filesystem

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
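
A sketch of removing a directory and a single file, reusing hdfs and fs from the construction sketch above; both paths are placeholders:

    hdfs.remove("/data/cases/output", filesystem=fs)                         # recursive by default
    hdfs.remove("/data/cases/evidence.zip", recursive=False, filesystem=fs)  # single file, no recursion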
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream (self, stream, offset, whence=os.SEEK_SET)

Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param stream: the opened stream
:param offset: the position/offset
:param whence: the direction

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
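
A sketch of an absolute seek, with stream taken from the open_inputstream sketch above; the offset is illustrative:

    import os

    hdfs.seek_stream(stream, 1024, whence=os.SEEK_SET)   # jump to an absolute offset
    print(hdfs.get_stream_offset(stream))                # should now report 1024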
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.seek_stream_with_path (self, stream, offset, whence=os.SEEK_SET, path=None)

Set a given position (an offset) in a given (previously opened) input-stream for an HDFS file.
:param stream: the opened stream
:param offset: the position/offset
:param whence: the direction
:param path: the path of the file, required to be able to seek from the end of the file
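
A sketch of seeking relative to the end of a file; stream and path come from the open_inputstream sketch above, and interpreting a negative offset with os.SEEK_END in the usual way is an assumption:

    import os

    # The path is needed so the file size can be resolved when seeking from the end.
    hdfs.seek_stream_with_path(stream, -512, whence=os.SEEK_END, path=path)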
def plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.write_outputstream (self, output_stream, data)

Write a data buffer to a given opened output-stream for an HDFS file.
:param output_stream: the output-stream
:param data: the data buffer to write
:return: the number of bytes written

Reimplemented from plaso.tarzan.lib.hdfs.Hdfs.
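
A sketch of writing a small payload, reusing hdfs and fs from the construction sketch above; the path and the bytes payload are placeholders, and passing the make_path result and a bytes object is an assumption. Closing the output stream is not shown, since close_stream is documented above for input-streams only:

    out_path = hdfs.make_path("/data/cases/output/report.txt", filesystem=fs)
    out_stream = hdfs.open_outputstream(out_path, filesystem=fs)
    written = hdfs.write_outputstream(out_stream, b"extracted timestamps\n")
    print(written)   # the number of bytes written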
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.fs_class
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.isr_class
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.path_class
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.spark_context
plaso.tarzan.lib.pyspark_hdfs.PySparkHdfs.uri_class