csc-665-homework/hw3/heredity.py

import csv
import itertools
import sys

PROBS = {

    # Unconditional probabilities for having gene
    "gene": {
        2: 0.01,
        1: 0.03,
        0: 0.96
    },

    "trait": {

        # Probability of trait given two copies of gene
        2: {
            True: 0.65,
            False: 0.35
        },

        # Probability of trait given one copy of gene
        1: {
            True: 0.56,
            False: 0.44
        },

        # Probability of trait given no gene
        0: {
            True: 0.01,
            False: 0.99
        }
    },

    # Mutation probability
    "mutation": 0.01
}


def main():

    # Check for proper usage
    if len(sys.argv) != 2:
        sys.exit("Usage: python heredity.py data.csv")
    people = load_data(sys.argv[1])

    # Keep track of gene and trait probabilities for each person
    probabilities = {
        person: {
            "gene": {
                2: 0,
                1: 0,
                0: 0
            },
            "trait": {
                True: 0,
                False: 0
            }
        }
        for person in people
    }

    # Loop over all sets of people who might have the trait
    names = set(people)
    for have_trait in powerset(names):

        # Check if current set of people violates known information
        fails_evidence = any(
            (people[person]["trait"] is not None and
             people[person]["trait"] != (person in have_trait))
            for person in names
        )
        if fails_evidence:
            continue

        # Loop over all sets of people who might have the gene
        for one_gene in powerset(names):
            for two_genes in powerset(names - one_gene):

                # Update probabilities with new joint probability
                p = joint_probability(people, one_gene, two_genes, have_trait)
                update(probabilities, one_gene, two_genes, have_trait, p)

    # Ensure probabilities sum to 1
    normalize(probabilities)

    # Print results
    for person in people:
        print(f"{person}:")
        for field in probabilities[person]:
            print(f"  {field.capitalize()}:")
            for value in probabilities[person][field]:
                p = probabilities[person][field][value]
                print(f"    {value}: {p:.4f}")


def load_data(filename):
    """
    Load gene and trait data from a file into a dictionary.
    File assumed to be a CSV containing fields name, mother, father, trait.
    mother, father must both be blank, or both be valid names in the CSV.
    trait should be 0 or 1 if trait is known, blank otherwise.
    """
    data = dict()
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            name = row["name"]
            data[name] = {
                "name": name,
                "mother": row["mother"] or None,
                "father": row["father"] or None,
                "trait": (True if row["trait"] == "1" else
                          False if row["trait"] == "0" else None)
            }
    return data


def powerset(s):
    """
    Return a list of all possible subsets of set s.
    """
    s = list(s)
    return [
        set(s) for s in itertools.chain.from_iterable(
            itertools.combinations(s, r) for r in range(len(s) + 1)
        )
    ]


def joint_probability(people, one_gene, two_genes, have_trait):
    """
    Compute and return a joint probability.

    Args:
    - people is a dictionary of people. The keys represent names, and
      the values are dictionaries that contain mother and father keys.
      You may assume that either mother and father are both blank (no
      parental information in the data set), or mother and father will
      both refer to other people in the people dictionary.
    - one_gene is a set of all people for whom we want to compute the
      probability that they have one copy of the gene.
    - two_genes is a set of all people for whom we want to compute the
      probability that they have two copies of the gene.
    - have_trait is a set of all people for whom we want to compute the
      probability that they have the trait.

    The probability returned should be the probability that
    - everyone in set `one_gene` has one copy of the gene, and
    - everyone in set `two_genes` has two copies of the gene, and
    - everyone not in `one_gene` or `two_gene` does not have the gene, and
    - everyone in set `have_trait` has the trait, and
    - everyone not in set` have_trait` does not have the trait.

    and trait = {"Harry", "James"} should calculate the probability that
    Lily has zero copies of the gene, Harry has one copy of the gene,
    James has two copies of the gene, Harry exhibits the trait, James
    exhibits the trait, and Lily does not exhibit the trait.

    For anyone with no parents listed in the data set, use the probability
    distribution PROBS["gene"] to determine the probability that they have
    a particular number of the gene.

    For anyone with parents in the data set, each parent will pass one of
    their two genes on to their child randomly, and there is a
    PROBS["mutation"] chance that it mutates (goes from being the gene to not
    being the gene, or vice versa).

    Use the probability distribution PROBS["trait"] to compute the probability
    that a person does or does not have a particular trait.
    """
    probability = 1

    for person in people:
        # Check if the person has one copy of the gene
        if person in one_gene:
            # Calculate the probability of having one copy of the gene
            gene_prob = PROBS["gene"][1]
        # Check if the person has two copies of the gene
        elif person in two_genes:
            # Calculate the probability of having two copies of the gene
            gene_prob = PROBS["gene"][2]
        else:
            # Calculate the probability of not having the gene
            gene_prob = PROBS["gene"][0]

        # Check if the person has the trait
        if person in have_trait:
            # Calculate the probability of having the trait
            trait_prob = PROBS["trait"][gene_prob][True]
        else:
            # Calculate the probability of not having the trait
            trait_prob = PROBS["trait"][gene_prob][False]

        # Multiply the gene and trait probabilities
        probability *= gene_prob * trait_prob

    return probability


def update(probabilities, one_gene, two_genes, have_trait, p):
    """
    Add to `probabilities` a new joint probability `p`.
    Each person should have their "gene" and "trait" distributions updated.
    Which value for each distribution is updated depends on whether
    the person is in `have_gene` and `have_trait`, respectively.

    Args:
    - probabilities is a dictionary of people. Each person is mapped to a
      "gene" distribution and a "trait" distribution.
    - one_gene is a set of people with one copy of the gene in the current
      joint distribution.
    - two_genes is a set of people with two copies of the gene in the
      current joint distribution.
    - have_trait is a set of people with the trait in the current joint
      distribution.
    - p is the probability of the joint distribution.

    For each person person in `probabilities`, the function should update
    the probabilities[person]["gene"] distribution and probabilities[person]["trait"]
    distribution by adding p to the appropriate value in each distribution.
    All other values should be left unchanged.

    For example, if "Harry" were in both two_genes and in have_trait, then p would
    be added to probabilities["Harry"]["gene"][2] and to
    probabilities["Harry"]["trait"][True].

    The function should not return any value: it just needs to update the
    probabilities dictionary.
    """

    for person in probabilities:
        if person in one_gene:
            probabilities[person]["gene"][1] += p
        elif person in two_genes:
            probabilities[person]["gene"][2] += p
        else:
            probabilities[person]["gene"][0] += p

        if person in have_trait:
            probabilities[person]["trait"][True] += p
        else:
            probabilities[person]["trait"][False] += p


def normalize(probabilities):
    """
    Update `probabilities` such that each probability distribution
    is normalized (i.e., sums to 1, with relative proportions the same).

    Args:
    - probabilities is a dictionary of people. Each person is mapped to a
      "gene" distribution and a "trait" distribution.

    For both of the distributions for each person in probabilities, this
    function should normalize that distribution so that the values in the
    distribution sum to 1, and the relative values in the distribution are the same.

    For example, if probabilities["Harry"]["trait"][True] were equal to 0.1 and
    probabilities["Harry"]["trait"][False] were equal to 0.3, then your function
    should update the former value to be 0.25 and the latter value to be 0.75: the
    numbers now sum to 1, and the latter value is still three times larger than
    the former value.

    The function should not return any value: it just needs to update the
    probabilities dictionary.
    """

    for person in probabilities:
        # Calculate the sum of the gene probabilities
        gene_sum = sum(probabilities[person]["gene"].values())
        # Calculate the sum of the trait probabilities
        trait_sum = sum(probabilities[person]["trait"].values())

        # Normalize the gene probabilities
        for gene in probabilities[person]["gene"]:
            probabilities[person]["gene"][gene] /= gene_sum

        # Normalize the trait probabilities
        for trait in probabilities[person]["trait"]:
            probabilities[person]["trait"][trait] /= trait_sum


if __name__ == "__main__":
    main()