bash:extracteur_statusnet
Ceci est une ancienne révision du document !
Extracteur StatusNet
Usage : téléchargez le script, rendez-le exécutable (chmod +x), puis lancez-le dans un terminal.
- Demande instance et compte
- récupération de la page, affichage du nombre de statuts et du nombre de pages
- demande de confirmation
- création dossier “temppages”, wget des pages dedans
- “one-line” des pages, création dossier “tempstatus”
- extraction des statuts (1 fichier par statut)
- traitement des fichiers de statut
- sortie des statuts en “nom de compte.csv” (id,date,texte,client,contexte)
- suppression des dossiers temppages et tempstatus
Testé et fonctionnel avec :
- identi.ca/gatitac
- status.jbfavre.org
- status.ndn.cx/gordontesos
- statusextract
#!/bin/bash
#
# statusextract (part 1 of 2): download a StatusNet timeline and split it
# into one file per notice.
#
# Prompts for "<instance>/<user>", reads the notice count from the profile
# page, downloads every timeline page into temppages/ and writes each notice
# to tempstatus/<notice id left-padded to 10 digits>.
#
# On completion the working directory is unchanged and $account still holds
# the "<instance>/<user>" string that was entered.

# Make a failed wget fail the whole pipeline instead of being masked by the
# text filters that follow it.
set -o pipefail

# Divisor used to turn the notice count into a page count.
readonly NOTICES_PER_PAGE=20

#######################################
# Prints a message on stderr and aborts the script.
# Arguments: $* - message to print
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Extracts every notice found in one downloaded timeline page.
# Arguments: $1 - path of the downloaded HTML page
#            $2 - directory receiving one file per notice
# Outputs:   writes <dir>/<10-digit notice id>, one line per file
#######################################
split_page_into_notices() {
  local page_file=$1
  local notice_dir=$2

  # Join the page on a single line, keep only what sits inside the notice
  # list, then cut that list into <li> items. Everything is done in awk so
  # the notice text is never word-split or glob-expanded by the shell.
  tr '\n' ' ' <"$page_file" \
    | awk -F '<ol class="notices xoxo">' '{ print $2 }' \
    | awk -F '</ol>' '{ print $1 }' \
    | awk -F '<li' -v notice_dir="$notice_dir" '
        {
          # Field 1 is whatever precedes the first <li; items start at 2.
          for (i = 2; i <= NF; i++) {
            notice = $i
            sub(/<\/li>.*/, "", notice)

            # Skip items that carry no numeric notice id instead of
            # filing them under a bogus all-zero name.
            if (!match(notice, /id="notice-[0-9]+/)) {
              continue
            }
            # 11 is the length of the literal prefix id="notice-
            id = substr(notice, RSTART + 11, RLENGTH - 11)

            # Squeeze whitespace runs so each notice is one tidy line.
            gsub(/[ \t]+/, " ", notice)
            sub(/^ /, "", notice)
            sub(/ $/, "", notice)

            # Left-pad to 10 digits so a plain file name sort is also a
            # numeric sort of the ids.
            while (length(id) < 10) {
              id = "0" id
            }

            # Truncate rather than append: a notice seen on two pages
            # must still end up as a single line.
            target = notice_dir "/" id
            print notice > target
            close(target)
          }
        }
      '
}

read -r -p "Instance et nom d'utilisateur (exemple: identi.ca/gatitac): " account
[[ -n "$account" ]] || die "Aucun compte indiqué."

# The count is the first <dd> inside <dl class="entity_notices"> on the
# profile page.
nombre=$(
  wget -nv "http://$account" -O - \
    | tr '\n' ' ' \
    | awk -F '<dl class="entity_notices">' '{ print $2 }' \
    | awk -F '<dd>' '{ print $2 }' \
    | awk -F '<' '{ print $1 }'
) || die "Impossible de récupérer http://$account"
nombre=${nombre//[[:space:]]/}
[[ "$nombre" =~ ^[0-9]+$ ]] \
  || die "Nombre de statuts introuvable sur http://$account"

echo "****************************"
echo "Nombre de statuts: $nombre"
# 10# forces base 10 so a count with leading zeros is not read as octal.
# The "+ 1" can request one page too many; a page without notices simply
# contributes nothing below.
pages=$((10#$nombre / NOTICES_PER_PAGE + 1))
echo "Ça fait $pages pages à récupérer."
# Any answer continues; the user aborts with Ctrl-C.
read -r -p "On continue chef ? ENTRÉE pour oui." _

# Start from empty scratch directories so leftovers of an aborted earlier
# run cannot leak into this export.
rm -rf -- "temppages" "tempstatus"
mkdir -p "temppages" "tempstatus" \
  || die "Impossible de créer les dossiers temporaires."

for ((page = 1; page <= pages; page++)); do
  wget -nv "http://$account?page=$page" -O "temppages/page$page.html" \
    || printf 'Avertissement: page %d non récupérée.\n' "$page" >&2
done

echo "********************"
echo "Extraction des statuts"
echo "********************"
echo ""

for ((page = 1; page <= pages; page++)); do
  # A failed download leaves an empty file; there is nothing to split.
  [[ -s "temppages/page$page.html" ]] || continue
  split_page_into_notices "temppages/page$page.html" "tempstatus"
done
# statusextract (part 2 of 2): turn the per-notice files into a CSV.
#
# Reads every file in tempstatus/ (one notice per file, named after the
# notice id) and appends one row per notice to "<account>.csv", where
# <account> is $account with "." replaced by "-" and "/" by "_".
# Columns: id, date, text, client, conversation URL.
# Both scratch directories are removed afterwards.
#
# Requires $account to hold the "<instance>/<user>" string read earlier.

#######################################
# Prints the CSV row describing one notice file.
# Arguments: $1 - path of a notice file; its base name is used as the id
# Outputs:   one CSV row on stdout, nothing for a blank or empty file
#######################################
notice_to_csv_row() {
  local notice_file=$1

  awk -v id="${notice_file##*/}" '
    # Text found after the first "start" and before the next "stop";
    # empty when "start" is absent, rest of the line when "stop" is.
    function between(s, start, stop,    from, rest, to) {
      from = index(s, start)
      if (from == 0) {
        return ""
      }
      rest = substr(s, from + length(start))
      to = index(rest, stop)
      return (to == 0) ? rest : substr(rest, 1, to - 1)
    }

    function strip_tags(s) {
      gsub(/<[^>]*>/, "", s)
      return s
    }

    # Quote a field and double any embedded quote so the row stays valid.
    function csv(s) {
      gsub(/"/, "\"\"", s)
      return "\"" s "\""
    }

    # Only the first line describes the notice; a blank one has no data.
    FNR > 1 || /^[ \t]*$/ { exit }

    {
      text = strip_tags(between($0, "<p class=\"entry-content\">", "</p>"))
      date = between($0, "<abbr class=\"published\" title=\"", "\">")

      device = strip_tags(between($0, "<span class=\"device\">", "</span>"))
      gsub(/ /, "", device)

      # Conversation link on any instance, not only identi.ca.
      # 9 and 19 are the lengths of the literal text around the URL.
      context = ""
      if (match($0, /<a href="https?:\/\/[^"]*\/conversation\/[^"]*" class="response">/)) {
        context = substr($0, RSTART + 9, RLENGTH - 9 - 19)
      }

      print csv(id) "," csv(date) "," csv(text) "," csv(device) "," csv(context)
      exit
    }
  ' "$notice_file"
}

echo "********************"
echo "Traitement des statuts"
echo "********************"
echo ""

: "${account:?compte non défini}"
output=${account//./-}
output=${output//\//_}

# nullglob makes an empty or missing directory yield an empty list
# instead of the literal pattern.
shopt -s nullglob
notice_files=(tempstatus/*)
shopt -u nullglob

if ((${#notice_files[@]} == 0)); then
  printf 'Aucun statut à traiter.\n' >&2
else
  # The zero-padded file names make the glob order the notice id order.
  for notice_file in "${notice_files[@]}"; do
    notice_to_csv_row "$notice_file"
  done >>"$output.csv"
fi

echo "Nettoyage.."
rm -rf -- "tempstatus/" "temppages/"
bash/extracteur_statusnet.1343990255.txt.gz · Dernière modification : 2013-02-19 20:28 (modification externe)