Listings Winkler/Scholz, Reddit Teil 2

Listing 1: Anzahl der Autoren bestimmen
# Author statistics for the posts2020 table. A row whose parent_id is NULL
# is an initial post; a row with a parent_id is a comment.
# NOTE(review): `author NOT IN (subquery)` matches no row at all if the
# subquery returns a NULL author -- confirm that author is never NULL.

# Number of distinct authors of initial posts.
pd.read_sql("SELECT COUNT(DISTINCT author) FROM posts2020 WHERE parent_id IS NULL", sql)

# Number of distinct authors who wrote only initial posts (never a comment).
pd.read_sql("SELECT COUNT(DISTINCT author) FROM posts2020 WHERE parent_id IS NULL AND author NOT IN (SELECT DISTINCT author FROM posts2020 WHERE parent_id IS NOT NULL)", sql)

# Number of distinct authors of comments.
pd.read_sql("SELECT COUNT(DISTINCT author) FROM posts2020 WHERE parent_id IS NOT NULL", sql)

# Number of distinct authors who wrote only comments (never an initial post).
pd.read_sql("SELECT COUNT(DISTINCT author) FROM posts2020 WHERE parent_id IS NOT NULL AND author NOT IN (SELECT DISTINCT author FROM posts2020 WHERE parent_id IS NULL)", sql)

# Number of distinct authors who wrote both initial posts and comments.
pd.read_sql("SELECT COUNT(DISTINCT author) FROM posts2020 WHERE parent_id IS NOT NULL AND author IN (SELECT DISTINCT author FROM posts2020 WHERE parent_id IS NULL)", sql)

-----

Listing 2: Die Kanten des Graphen ausfindig machen
# Load one row per post that has replies: the author of that post plus the
# concatenated list of authors who replied to it.
# The CTE `linear` groups posts2020 by parent_id; the outer query joins it
# back to posts2020 (p.id = l.parent_id) to look up the parent's author.
# `timestamps` and `score` are aggregated in the CTE but not selected by the
# outer query.
# Adjacent string literals are concatenated by Python into a single SQL string.
ar = pd.read_sql(
    "with linear as "
    "    (select parent_id, "
    "      group_concat(author) as response_authors, "
    "      group_concat(created_utc) as timestamps, "
    "      group_concat(score) as score "
    "    from posts2020 group by parent_id) "
    "      select p.author as parent_author, response_authors "
    "        from posts2020 p, linear l "
    "        where p.id=l.parent_id",
    sql,
)

-----

Listing 3: Ein Counter misst, wie oft Nutzer kommunizieren
from collections import Counter

# Count how often one top author replies to another. Each key has the form
# "<replying author>#<parent author>"; the value is the number of replies.
edges = Counter()
for _, row in tqdm(ar.iterrows(), total=len(ar)):
    target = row["parent_author"]
    if target not in top_authors:
        # Skip posts whose author is not among the top authors.
        continue
    # response_authors is a comma-separated string; keep only top authors.
    edges.update(
        "#".join([source, target])
        for source in row["response_authors"].split(",")
        if source in top_authors
    )

-----

Listing 4: Knoten ergänzen
import numpy as np

# Create one graph vertex for every distinct author that occurs in an edge key.
# NOTE(review): `g` and `node_names` are not created in this listing --
# presumably a graph-tool Graph and a string vertex property defined earlier;
# confirm before running.
nodes = {}        # author name -> vertex
node_number = {}  # running index -> author name

# Edge keys look like "source#target"; np.unique returns the sorted distinct names.
author_names = np.unique([name for key in edges for name in key.split("#")])
for index, name in enumerate(author_names):
    vertex = g.add_vertex()
    nodes[name] = vertex
    node_names[vertex] = name
    node_number[index] = name

# Register the name property on the graph under the key "name".
g.vertex_properties["name"] = node_names

-----

Listing 5: Dem Graphen Kanten zuordnen
# Add every counted author pair to the graph as an edge from the replying
# author to the parent author, and record two per-edge properties.
edge_weights = g.new_edge_property("int")    # raw reply count
edge_width = g.new_edge_property("double")   # reply count scaled to 0..5

# Largest reply count, used to normalise the widths. `default=1` avoids a
# failure when `edges` is empty (the loop below is then simply skipped).
max_weight = max(edges.values(), default=1)
for edge, count in edges.items():
    # Keys have the form "source#target".
    source, target = edge.split("#")
    e = g.add_edge(nodes[source], nodes[target])
    edge_weights[e] = count
    # The most frequent pair gets width 5; all others proportionally less.
    edge_width[e] = 5 * count / max_weight

# Register both properties on the graph under the keys "weight" and "width".
g.edge_properties["weight"] = edge_weights
g.edge_properties["width"] = edge_width

-----

Listing 6: Einen Graphen mit dem Algorithmus sfdp-Layout visualisieren
# Import the submodule explicitly: a bare `import matplotlib` does not
# guarantee that the `matplotlib.cm` attribute is available.
import matplotlib.cm

# Compute vertex positions with sfdp_layout and draw the graph with edge
# widths taken from the `edge_width` property.
pos = sfdp_layout(g)
graph_draw(
    g,
    pos,
    output_size=(1000, 1000),
    vertex_color=[1, 1, 1, 0],
    vertex_size=1,
    edge_pen_width=edge_width,  # 0.2 was noted as an alternative value
    vcmap=matplotlib.cm.gist_heat_r,
)

-----

Listing 7: Mit graph-tool den PageRank berechnen
# Compute PageRank with `edge_weights` as the edge weight, then redraw the
# graph at the existing positions `pos`, labelling each vertex with its name.
pr = pagerank(g, weight=edge_weights)

# Vertex sizes derived from the PageRank values
# (presumably scaled into the range 5..30 -- confirm against graph-tool docs).
vertex_sizes = prop_to_size(pr, mi=5, ma=30)

graph_draw(
    g,
    pos,
    output_size=(1000, 1000),
    vertex_fill_color=pr,
    edge_pen_width=edge_width,  # 0.2 was noted as an alternative value
    vertex_text=node_names,
    vertex_text_position=0.0,
    vertex_size=vertex_sizes,
)


