distributed_systems_MIT/lec15

lec15

dz / distributed_systems_MIT / lec15

Summary

Lecture 15: Spark

Node Tree

spark

Nodes

spark
content	spark
children	HDFS, driver, fault_tolerance, mapreduce_successor, pagerank, spark_exec

mapreduce_successor
content	Successor to MapReduce
children	generalizes_mapreduce
parents	spark

generalizes_mapreduce
content	Generalizes map + reduce steps
parents	mapreduce_successor

pagerank
content	Pagerank algorithm
children	difficult_in_MR, estimates_page_importance (description), spark_exec (demo: pagerank implemented in Spark)
parents	spark

estimates_page_importance
content	Estimates importance of page
parents	pagerank

difficult_in_MR
content	difficult to implement in MapReduce
parents	pagerank

driver
content	driver: computer that runs program
parents	spark

spark_exec
content	How does spark execute?
children	cache, collect, distinct, exec_looks_like, group_by_key, join, map_values, persist_data, readfile, reduce_by_key
parents	pagerank, spark

distinct
content	distinct
children	info_all_workers (distinct is a wide operation)
parents	spark_exec, wide

readfile
content	Read File
children	doesnt_read, lineage_graph
parents	spark_exec

lineage_graph
content	lineage graph
children	doesnt_read, looks_at_lineage_graph
parents	readfile

doesnt_read
content	Doesn't initially read, only produces lineage graph
children	doesnt_process_data (AKA)
parents	readfile, lineage_graph

join
content	join
parents	spark_exec

group_by_key
content	group-by key
parents	spark_exec

collect
content	collect
parents	spark_exec

cache
content	cache
children	persist_data
parents	spark_exec

persist_data
content	Persist Data
parents	spark_exec, cache

reduce_by_key
content	reduce by key
parents	spark_exec

map_values
content	map values
parents	spark_exec

doesnt_process_data
content	doesn't process data
parents	doesnt_read

exec_looks_like
content	What does execution look like?
children	optimization, transformations
parents	spark_exec

transformations
content	Transformations
children	narrow, wide
parents	exec_looks_like

narrow
content	narrow
children	individual_workers, wide (vs)
parents	transformations

wide
content	wide
children	distinct (distinct is a wide transformation), expensive
parents	transformations, narrow

individual_workers
content	Individual Workers
parents	narrow

expensive
content	Expensive
parents	wide

info_all_workers
content	Needs to know info from all workers
parents	distinct

optimization
content	Optimization
children	looks_at_lineage_graph
parents	exec_looks_like

looks_at_lineage_graph
content	Looks at lineage graph
parents	optimization, lineage_graph

fault_tolerance
content	Fault tolerance
children	driveer_not_replicated, failed_worker_wide_deps, input_assumed_ft, not_bulletproof, tolerate_common_errors
parents	spark

tolerate_common_errors
content	Tolerate common errors
parents	fault_tolerance

HDFS
content	HDFS
children	input_assumed_ft
parents	spark

input_assumed_ft
content	Input assumed to be fault-tolerant via HDFS
parents	HDFS, fault_tolerance

not_bulletproof
content	Doesn't have to be bullet-proof
children	driveer_not_replicated (for example)
parents	fault_tolerance

driveer_not_replicated
content	Driver machine not replicated
parents	not_bulletproof, fault_tolerance

failed_worker_wide_deps
content	Failed Worker, Wide dependencies
children	recompute_days_worth
parents	fault_tolerance

recompute_days_worth
content	Can end up recomputing a day's worth of computation
children	checkpoints (mitigations against)
parents	failed_worker_wide_deps

checkpoints
content	Checkpoints for specific transformations
parents	recompute_days_worth